1 /* Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2 
3    This library is free software; you can redistribute it and/or
4    modify it under the terms of the GNU Library General Public
5    License as published by the Free Software Foundation; version 2
6    of the License.
7 
8    This library is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11    Library General Public License for more details.
12 
13    You should have received a copy of the GNU Library General Public
14    License along with this library; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
16 
17 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
18 
19 #include <my_global.h>
20 #include <my_sys.h>
21 #include "m_string.h"
22 #include "m_ctype.h"
23 #include <errno.h>
24 #include <stdarg.h>
25 
26 
/* HAVE_CHARSET_mb2 is set when utf16 or ucs2 support is compiled in. */
#if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2)
#define HAVE_CHARSET_mb2
#endif


/*
  HAVE_CHARSET_mb2_or_mb4 is set when any of utf16, ucs2 or utf32 is
  compiled in; it guards the *_mb2_or_mb4 helpers in this file.
*/
#if defined(HAVE_CHARSET_mb2) || defined(HAVE_CHARSET_utf32)
#define HAVE_CHARSET_mb2_or_mb4
#endif


/* Fall back to ENOENT when <errno.h> does not define EILSEQ. */
#ifndef EILSEQ
#define EILSEQ ENOENT
#endif

/* All bits set: the largest value an ulonglong can hold. */
#define ULONGLONG_MAX                (~(ulonglong) 0)
/* 2^63 as an unsigned value (bit pattern 0x8000000000000000). */
#define MAX_NEGATIVE_NUMBER        ((ulonglong) 0x8000000000000000LL)
/* Number of decimal digits my_strtoll10_mb2() accumulates per chunk. */
#define INIT_CNT  9
/* 10^9, 10^10 and 10^11: scale factors used when joining digit chunks. */
#define LFACTOR   1000000000ULL
#define LFACTOR1  10000000000ULL
#define LFACTOR2  100000000000ULL
47 
48 #ifdef HAVE_CHARSET_mb2_or_mb4
/*
  Powers of ten, 10^0 .. 10^8, indexed by digit count.
  The table is only ever read, so it is declared const.
*/
static const unsigned long lfactor[9]=
{ 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L };
51 
52 static inline int
my_bincmp(const uchar * s,const uchar * se,const uchar * t,const uchar * te)53 my_bincmp(const uchar *s, const uchar *se,
54           const uchar *t, const uchar *te)
55 {
56   int slen= (int) (se - s), tlen= (int) (te - t);
57   int len= MY_MIN(slen, tlen);
58   int cmp= memcmp(s, t, len);
59   return cmp ? cmp : slen - tlen;
60 }
61 
62 
63 static size_t
my_caseup_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s MY_ATTRIBUTE ((unused)))64 my_caseup_str_mb2_or_mb4(const CHARSET_INFO * cs  MY_ATTRIBUTE((unused)),
65                          char * s MY_ATTRIBUTE((unused)))
66 {
67   assert(0);
68   return 0;
69 }
70 
71 
72 static size_t
my_casedn_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s MY_ATTRIBUTE ((unused)))73 my_casedn_str_mb2_or_mb4(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
74                          char * s MY_ATTRIBUTE((unused)))
75 {
76   assert(0);
77   return 0;
78 }
79 
80 
81 static int
my_strcasecmp_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * s MY_ATTRIBUTE ((unused)),const char * t MY_ATTRIBUTE ((unused)))82 my_strcasecmp_mb2_or_mb4(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
83                          const char *s MY_ATTRIBUTE((unused)),
84                          const char *t MY_ATTRIBUTE((unused)))
85 {
86   assert(0);
87   return 0;
88 }
89 
90 
91 static long
my_strntol_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)92 my_strntol_mb2_or_mb4(const CHARSET_INFO *cs,
93                       const char *nptr, size_t l, int base,
94                       char **endptr, int *err)
95 {
96   int      negative= 0;
97   int      overflow;
98   int      cnv;
99   my_wc_t  wc;
100   unsigned int cutlim;
101   uint32 cutoff;
102   uint32 res;
103   const uchar *s= (const uchar*) nptr;
104   const uchar *e= (const uchar*) nptr+l;
105   const uchar *save;
106 
107   *err= 0;
108   do
109   {
110     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e))>0)
111     {
112       switch (wc)
113       {
114         case ' ' : break;
115         case '\t': break;
116         case '-' : negative= !negative; break;
117         case '+' : break;
118         default  : goto bs;
119       }
120     }
121     else /* No more characters or bad multibyte sequence */
122     {
123       if (endptr != NULL )
124         *endptr= (char*) s;
125       err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
126       return 0;
127     }
128     s+= cnv;
129   } while (1);
130 
131 bs:
132 
133   overflow= 0;
134   res= 0;
135   save= s;
136   cutoff= ((uint32)~0L) / (uint32) base;
137   cutlim= (uint) (((uint32)~0L) % (uint32) base);
138 
139   do {
140     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
141     {
142       s+= cnv;
143       if (wc >= '0' && wc <= '9')
144         wc-= '0';
145       else if (wc >= 'A' && wc <= 'Z')
146         wc= wc - 'A' + 10;
147       else if (wc >= 'a' && wc <= 'z')
148         wc= wc - 'a' + 10;
149       else
150         break;
151       if ((int)wc >= base)
152         break;
153       if (res > cutoff || (res == cutoff && wc > cutlim))
154         overflow= 1;
155       else
156       {
157         res*= (uint32) base;
158         res+= wc;
159       }
160     }
161     else if (cnv == MY_CS_ILSEQ)
162     {
163       if (endptr !=NULL )
164         *endptr = (char*) s;
165       err[0]= EILSEQ;
166       return 0;
167     }
168     else
169     {
170       /* No more characters */
171       break;
172     }
173   } while(1);
174 
175   if (endptr != NULL)
176     *endptr = (char *) s;
177 
178   if (s == save)
179   {
180     err[0]= EDOM;
181     return 0L;
182   }
183 
184   if (negative)
185   {
186     if (res > (uint32) INT_MIN32)
187       overflow= 1;
188   }
189   else if (res > INT_MAX32)
190     overflow= 1;
191 
192   if (overflow)
193   {
194     err[0]= ERANGE;
195     return negative ? INT_MIN32 : INT_MAX32;
196   }
197 
198   return (negative ? -((long) res) : (long) res);
199 }
200 
201 
202 static ulong
my_strntoul_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)203 my_strntoul_mb2_or_mb4(const CHARSET_INFO *cs,
204                        const char *nptr, size_t l, int base,
205                        char **endptr, int *err)
206 {
207   int      negative= 0;
208   int      overflow;
209   int      cnv;
210   my_wc_t  wc;
211   unsigned int cutlim;
212   uint32 cutoff;
213   uint32 res;
214   const uchar *s= (const uchar*) nptr;
215   const uchar *e= (const uchar*) nptr + l;
216   const uchar *save;
217 
218   *err= 0;
219   do
220   {
221     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
222     {
223       switch (wc)
224       {
225         case ' ' : break;
226         case '\t': break;
227         case '-' : negative= !negative; break;
228         case '+' : break;
229         default  : goto bs;
230       }
231     }
232     else /* No more characters or bad multibyte sequence */
233     {
234       if (endptr !=NULL )
235         *endptr= (char*)s;
236       err[0]= (cnv == MY_CS_ILSEQ) ? EILSEQ : EDOM;
237       return 0;
238     }
239     s+= cnv;
240   } while (1);
241 
242 bs:
243 
244   overflow= 0;
245   res= 0;
246   save= s;
247   cutoff= ((uint32)~0L) / (uint32) base;
248   cutlim= (uint) (((uint32)~0L) % (uint32) base);
249 
250   do
251   {
252     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
253     {
254       s+= cnv;
255       if (wc >= '0' && wc <= '9')
256         wc-= '0';
257       else if (wc >= 'A' && wc <= 'Z')
258         wc= wc - 'A' + 10;
259       else if (wc >= 'a' && wc <= 'z')
260         wc= wc - 'a' + 10;
261       else
262         break;
263       if ((int) wc >= base)
264         break;
265       if (res > cutoff || (res == cutoff && wc > cutlim))
266         overflow = 1;
267       else
268       {
269         res*= (uint32) base;
270         res+= wc;
271       }
272     }
273     else if (cnv == MY_CS_ILSEQ)
274     {
275       if (endptr != NULL )
276         *endptr= (char*)s;
277       err[0]= EILSEQ;
278       return 0;
279     }
280     else
281     {
282       /* No more characters */
283       break;
284     }
285   } while(1);
286 
287   if (endptr != NULL)
288     *endptr= (char *) s;
289 
290   if (s == save)
291   {
292     err[0]= EDOM;
293     return 0L;
294   }
295 
296   if (overflow)
297   {
298     err[0]= (ERANGE);
299     return (~(uint32) 0);
300   }
301 
302   return (negative ? -((long) res) : (long) res);
303 }
304 
305 
306 static longlong
my_strntoll_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)307 my_strntoll_mb2_or_mb4(const CHARSET_INFO *cs,
308                        const char *nptr, size_t l, int base,
309                        char **endptr, int *err)
310 {
311   int      negative=0;
312   int      overflow;
313   int      cnv;
314   my_wc_t  wc;
315   ulonglong    cutoff;
316   unsigned int cutlim;
317   ulonglong    res;
318   const uchar *s= (const uchar*) nptr;
319   const uchar *e= (const uchar*) nptr+l;
320   const uchar *save;
321 
322   *err= 0;
323   do
324   {
325     if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
326     {
327       switch (wc)
328       {
329         case ' ' : break;
330         case '\t': break;
331         case '-' : negative= !negative; break;
332         case '+' : break;
333         default  : goto bs;
334       }
335     }
336     else /* No more characters or bad multibyte sequence */
337     {
338       if (endptr !=NULL )
339         *endptr = (char*)s;
340       err[0] = (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
341       return 0;
342     }
343     s+=cnv;
344   } while (1);
345 
346 bs:
347 
348   overflow = 0;
349   res = 0;
350   save = s;
351   cutoff = (~(ulonglong) 0) / (unsigned long int) base;
352   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
353 
354   do {
355     if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
356     {
357       s+=cnv;
358       if ( wc>='0' && wc<='9')
359         wc -= '0';
360       else if ( wc>='A' && wc<='Z')
361         wc = wc - 'A' + 10;
362       else if ( wc>='a' && wc<='z')
363         wc = wc - 'a' + 10;
364       else
365         break;
366       if ((int)wc >= base)
367         break;
368       if (res > cutoff || (res == cutoff && wc > cutlim))
369         overflow = 1;
370       else
371       {
372         res *= (ulonglong) base;
373         res += wc;
374       }
375     }
376     else if (cnv==MY_CS_ILSEQ)
377     {
378       if (endptr !=NULL )
379         *endptr = (char*)s;
380       err[0]=EILSEQ;
381       return 0;
382     }
383     else
384     {
385       /* No more characters */
386       break;
387     }
388   } while(1);
389 
390   if (endptr != NULL)
391     *endptr = (char *) s;
392 
393   if (s == save)
394   {
395     err[0]=EDOM;
396     return 0L;
397   }
398 
399   if (negative)
400   {
401     if (res  > (ulonglong) LLONG_MIN)
402       overflow = 1;
403   }
404   else if (res > (ulonglong) LLONG_MAX)
405     overflow = 1;
406 
407   if (overflow)
408   {
409     err[0]=ERANGE;
410     return negative ? LLONG_MIN : LLONG_MAX;
411   }
412 
413   return (negative ? -((longlong)res) : (longlong)res);
414 }
415 
416 
417 static ulonglong
my_strntoull_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)418 my_strntoull_mb2_or_mb4(const CHARSET_INFO *cs,
419                         const char *nptr, size_t l, int base,
420                         char **endptr, int *err)
421 {
422   int      negative= 0;
423   int      overflow;
424   int      cnv;
425   my_wc_t  wc;
426   ulonglong    cutoff;
427   unsigned int cutlim;
428   ulonglong    res;
429   const uchar *s= (const uchar*) nptr;
430   const uchar *e= (const uchar*) nptr + l;
431   const uchar *save;
432 
433   *err= 0;
434   do
435   {
436     if ((cnv= cs->cset->mb_wc(cs,&wc,s,e)) > 0)
437     {
438       switch (wc)
439       {
440         case ' ' : break;
441         case '\t': break;
442         case '-' : negative= !negative; break;
443         case '+' : break;
444         default  : goto bs;
445       }
446     }
447     else /* No more characters or bad multibyte sequence */
448     {
449       if (endptr !=NULL )
450         *endptr = (char*)s;
451       err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
452       return 0;
453     }
454     s+=cnv;
455   } while (1);
456 
457 bs:
458 
459   overflow = 0;
460   res = 0;
461   save = s;
462   cutoff = (~(ulonglong) 0) / (unsigned long int) base;
463   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
464 
465   do
466   {
467     if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
468     {
469       s+=cnv;
470       if ( wc>='0' && wc<='9')
471         wc -= '0';
472       else if ( wc>='A' && wc<='Z')
473         wc = wc - 'A' + 10;
474       else if ( wc>='a' && wc<='z')
475         wc = wc - 'a' + 10;
476       else
477         break;
478       if ((int)wc >= base)
479         break;
480       if (res > cutoff || (res == cutoff && wc > cutlim))
481         overflow = 1;
482       else
483       {
484         res *= (ulonglong) base;
485         res += wc;
486       }
487     }
488     else if (cnv==MY_CS_ILSEQ)
489     {
490       if (endptr !=NULL )
491         *endptr = (char*)s;
492       err[0]= EILSEQ;
493       return 0;
494     }
495     else
496     {
497       /* No more characters */
498       break;
499     }
500   } while(1);
501 
502   if (endptr != NULL)
503     *endptr = (char *) s;
504 
505   if (s == save)
506   {
507     err[0]= EDOM;
508     return 0L;
509   }
510 
511   if (overflow)
512   {
513     err[0]= ERANGE;
514     return (~(ulonglong) 0);
515   }
516 
517   return (negative ? -((longlong) res) : (longlong) res);
518 }
519 
520 
521 static double
my_strntod_mb2_or_mb4(const CHARSET_INFO * cs,char * nptr,size_t length,char ** endptr,int * err)522 my_strntod_mb2_or_mb4(const CHARSET_INFO *cs,
523                       char *nptr, size_t length,
524                       char **endptr, int *err)
525 {
526   char     buf[256];
527   double   res;
528   char *b= buf;
529   const uchar *s= (const uchar*) nptr;
530   const uchar *end;
531   my_wc_t  wc;
532   int     cnv;
533 
534   *err= 0;
535   /* Cut too long strings */
536   if (length >= sizeof(buf))
537     length= sizeof(buf) - 1;
538   end= s + length;
539 
540   while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
541   {
542     s+= cnv;
543     if (wc > (int) (uchar) 'e' || !wc)
544       break;                                        /* Can't be part of double */
545     *b++= (char) wc;
546   }
547 
548   *endptr= b;
549   res= my_strtod(buf, endptr, err);
550   *endptr= nptr + cs->mbminlen * (size_t) (*endptr - buf);
551   return res;
552 }
553 
554 
555 static ulonglong
my_strntoull10rnd_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t length,int unsign_fl,char ** endptr,int * err)556 my_strntoull10rnd_mb2_or_mb4(const CHARSET_INFO *cs,
557                              const char *nptr, size_t length,
558                              int unsign_fl,
559                              char **endptr, int *err)
560 {
561   char  buf[256], *b= buf;
562   ulonglong res;
563   const uchar *end, *s= (const uchar*) nptr;
564   my_wc_t  wc;
565   int     cnv;
566 
567   /* Cut too long strings */
568   if (length >= sizeof(buf))
569     length= sizeof(buf)-1;
570   end= s + length;
571 
572   while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
573   {
574     s+= cnv;
575     if (wc > (int) (uchar) 'e' || !wc)
576       break;                            /* Can't be a number part */
577     *b++= (char) wc;
578   }
579 
580   res= my_strntoull10rnd_8bit(cs, buf, b - buf, unsign_fl, endptr, err);
581   *endptr= (char*) nptr + cs->mbminlen * (size_t) (*endptr - buf);
582   return res;
583 }
584 
585 
586 /*
587   This is a fast version optimized for the case of radix 10 / -10
588 */
589 
590 static size_t
my_l10tostr_mb2_or_mb4(const CHARSET_INFO * cs,char * dst,size_t len,int radix,long int val)591 my_l10tostr_mb2_or_mb4(const CHARSET_INFO *cs,
592                        char *dst, size_t len, int radix, long int val)
593 {
594   char buffer[66];
595   char *p, *db, *de;
596   long int new_val;
597   int  sl= 0;
598   unsigned long int uval = (unsigned long int) val;
599 
600   p= &buffer[sizeof(buffer) - 1];
601   *p= '\0';
602 
603   if (radix < 0)
604   {
605     if (val < 0)
606     {
607       sl= 1;
608       /* Avoid integer overflow in (-val) for LLONG_MIN (BUG#31799). */
609       uval  = (unsigned long int)0 - uval;
610     }
611   }
612 
613   new_val = (long) (uval / 10);
614   *--p    = '0'+ (char) (uval - (unsigned long) new_val * 10);
615   val= new_val;
616 
617   while (val != 0)
618   {
619     new_val= val / 10;
620     *--p= '0' + (char) (val - new_val * 10);
621     val= new_val;
622   }
623 
624   if (sl)
625   {
626     *--p= '-';
627   }
628 
629   for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
630   {
631     int cnvres= cs->cset->wc_mb(cs,(my_wc_t)p[0],(uchar*) dst, (uchar*) de);
632     if (cnvres > 0)
633       dst+= cnvres;
634     else
635       break;
636   }
637   return (int) (dst - db);
638 }
639 
640 
641 static size_t
my_ll10tostr_mb2_or_mb4(const CHARSET_INFO * cs,char * dst,size_t len,int radix,longlong val)642 my_ll10tostr_mb2_or_mb4(const CHARSET_INFO *cs,
643                         char *dst, size_t len, int radix, longlong val)
644 {
645   char buffer[65];
646   char *p, *db, *de;
647   long long_val;
648   int sl= 0;
649   ulonglong uval= (ulonglong) val;
650 
651   if (radix < 0)
652   {
653     if (val < 0)
654     {
655       sl= 1;
656       /* Avoid integer overflow in (-val) for LLONG_MIN (BUG#31799). */
657       uval = (ulonglong)0 - uval;
658     }
659   }
660 
661   p= &buffer[sizeof(buffer)-1];
662   *p='\0';
663 
664   if (uval == 0)
665   {
666     *--p= '0';
667     goto cnv;
668   }
669 
670   while (uval > (ulonglong) LONG_MAX)
671   {
672     ulonglong quo= uval/(uint) 10;
673     uint rem= (uint) (uval- quo* (uint) 10);
674     *--p= '0' + rem;
675     uval= quo;
676   }
677 
678   long_val= (long) uval;
679   while (long_val != 0)
680   {
681     long quo= long_val/10;
682     *--p= (char) ('0' + (long_val - quo*10));
683     long_val= quo;
684   }
685 
686 cnv:
687   if (sl)
688   {
689     *--p= '-';
690   }
691 
692   for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
693   {
694     int cnvres= cs->cset->wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
695     if (cnvres > 0)
696       dst+= cnvres;
697     else
698       break;
699   }
700   return (int) (dst -db);
701 }
702 
703 #endif /* HAVE_CHARSET_mb2_or_mb4 */
704 
705 
706 #ifdef HAVE_CHARSET_mb2
707 static longlong
my_strtoll10_mb2(const CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)708 my_strtoll10_mb2(const CHARSET_INFO *cs,
709                  const char *nptr, char **endptr, int *error)
710 {
711   const char *s, *end, *start, *n_end, *true_end;
712   uchar c;
713   unsigned long i, j, k;
714   ulonglong li;
715   int negative;
716   ulong cutoff, cutoff2, cutoff3;
717   my_wc_t wc;
718   int res;
719 
720   s= nptr;
721   /* If fixed length string */
722   if (endptr)
723   {
724     /*
725       Make sure string length is even.
726       Odd length indicates a bug in the caller.
727       Assert in debug, round in production.
728     */
729     assert((*endptr - s) % 2 == 0);
730     end= s + ((*endptr - s) / 2) * 2;
731 
732     for ( ; ; ) /* Skip leading spaces and tabs */
733     {
734       res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
735       if (res <= 0)
736         goto no_conv;
737       s+= res;
738       if (wc != ' ' && wc != '\t')
739         break;
740     }
741   }
742   else
743   {
744      /* We don't support null terminated strings in UCS2 */
745      goto no_conv;
746   }
747 
748   /* Check for a sign. */
749   negative= 0;
750   if (wc == '-')
751   {
752     *error= -1;                          /* Mark as negative number */
753     negative= 1;
754     res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
755     if (res <= 0)
756       goto no_conv;
757     s+= res;
758     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
759     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
760     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
761   }
762   else
763   {
764     *error= 0;
765     if (wc == '+')
766     {
767       res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
768       if (res <= 0)
769         goto no_conv;
770       s+= res;
771     }
772     cutoff=  ULONGLONG_MAX / LFACTOR2;
773     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
774     cutoff3=  ULONGLONG_MAX % 100;
775   }
776 
777 
778   /* Handle case where we have a lot of pre-zero */
779   if (wc == '0')
780   {
781     i= 0;
782     for ( ; ; s+= res)
783     {
784       if (s == end)
785         goto end_i;                                /* Return 0 */
786       res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
787       if (res <= 0)
788         goto no_conv;
789       if (wc != '0')
790         break;
791     }
792     while (wc == '0');
793     n_end= s + 2 * INIT_CNT;
794   }
795   else
796   {
797     /* Read first digit to check that it's a valid number */
798     if ((c= (wc - '0')) > 9)
799       goto no_conv;
800     i= c;
801     n_end= s + 2 * (INIT_CNT-1);
802   }
803 
804   /* Handle first 9 digits and store them in i */
805   if (n_end > end)
806     n_end= end;
807   for ( ; ; )
808   {
809     res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) n_end);
810     if (res <= 0)
811       break;
812     s+= res;
813     if ((c= (wc - '0')) > 9)
814       goto end_i;
815     i= i*10+c;
816   }
817   if (s == end)
818     goto end_i;
819 
820   /* Handle next 9 digits and store them in j */
821   j= 0;
822   start= s;                                /* Used to know how much to shift i */
823   n_end= true_end= s + 2 * INIT_CNT;
824   if (n_end > end)
825     n_end= end;
826   do
827   {
828     res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
829     if (res <= 0)
830       goto no_conv;
831     s+= res;
832     if ((c= (wc - '0')) > 9)
833       goto end_i_and_j;
834     j= j*10+c;
835   } while (s != n_end);
836   if (s == end)
837   {
838     if (s != true_end)
839       goto end_i_and_j;
840     goto end3;
841   }
842   res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
843   if (res <= 0)
844     goto no_conv;
845   s+= res;
846   if ((c= (wc - '0')) > 9)
847     goto end3;
848 
849   /* Handle the next 1 or 2 digits and store them in k */
850   k=c;
851   if (s == end)
852     goto end4;
853   res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
854   if (res <= 0)
855     goto no_conv;
856   s+= res;
857   if ((c= (wc - '0')) > 9)
858     goto end4;
859   k= k*10+c;
860   *endptr= (char*) s;
861 
862   /* number string should have ended here */
863   if (s != end && (c= (wc - '0')) <= 9)
864     goto overflow;
865 
866   /* Check that we didn't get an overflow with the last digit */
867   if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
868                                      k > cutoff3)))
869     goto overflow;
870   li=i*LFACTOR2+ (ulonglong) j*100 + k;
871   return (longlong) li;
872 
873 overflow:                                        /* *endptr is set here */
874   *error= MY_ERRNO_ERANGE;
875   return negative ? LLONG_MIN : (longlong) ULONGLONG_MAX;
876 
877 end_i:
878   *endptr= (char*) s;
879   return (negative ? ((longlong) -(long) i) : (longlong) i);
880 
881 end_i_and_j:
882   li= (ulonglong) i * lfactor[(size_t) (s-start) / 2] + j;
883   *endptr= (char*) s;
884   return (negative ? -((longlong) li) : (longlong) li);
885 
886 end3:
887   li=(ulonglong) i*LFACTOR+ (ulonglong) j;
888   *endptr= (char*) s;
889   return (negative ? -((longlong) li) : (longlong) li);
890 
891 end4:
892   li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
893   *endptr= (char*) s;
894   if (negative)
895   {
896    if (li > MAX_NEGATIVE_NUMBER)
897      goto overflow;
898    return -((longlong) li);
899   }
900   return (longlong) li;
901 
902 no_conv:
903   /* There was no number to convert.  */
904   *error= MY_ERRNO_EDOM;
905   *endptr= (char *) nptr;
906   return 0;
907 }
908 
909 
910 static size_t
my_scan_mb2(const CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)911 my_scan_mb2(const CHARSET_INFO *cs,
912             const char *str, const char *end, int sequence_type)
913 {
914   const char *str0= str;
915   my_wc_t wc;
916   int res;
917 
918   switch (sequence_type)
919   {
920   case MY_SEQ_SPACES:
921     for (res= cs->cset->mb_wc(cs, &wc,
922                               (const uchar *) str, (const uchar *) end);
923          res > 0 && wc == ' ';
924          str+= res,
925          res= cs->cset->mb_wc(cs, &wc,
926                               (const uchar *) str, (const uchar *) end))
927     {
928     }
929     return (size_t) (str - str0);
930   default:
931     return 0;
932   }
933 }
934 
935 
936 static void
my_fill_mb2(const CHARSET_INFO * cs,char * s,size_t slen,int fill)937 my_fill_mb2(const CHARSET_INFO *cs, char *s, size_t slen, int fill)
938 {
939   char buf[10];
940   int buflen;
941 
942   assert((slen % 2) == 0);
943 
944   buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
945                           (uchar*) buf + sizeof(buf));
946 
947   assert(buflen > 0);
948 
949   while (slen >= (size_t) buflen)
950   {
951     /* Enough space for the characer */
952     memcpy(s, buf, (size_t) buflen);
953     s+= buflen;
954     slen-= buflen;
955   }
956 
957   /*
958     If there are some more space which is not enough
959     for the whole multibyte character, then add trailing zeros.
960   */
961   for ( ; slen; slen--)
962   {
963     *s++= 0x00;
964   }
965 }
966 
967 
968 static size_t
my_vsnprintf_mb2(char * dst,size_t n,const char * fmt,va_list ap)969 my_vsnprintf_mb2(char *dst, size_t n, const char* fmt, va_list ap)
970 {
971   char *start=dst, *end= dst + n - 1;
972   for (; *fmt ; fmt++)
973   {
974     if (fmt[0] != '%')
975     {
976       if (dst == end)                     /* End of buffer */
977         break;
978 
979       *dst++='\0';
980       *dst++= *fmt;          /* Copy ordinary char */
981       continue;
982     }
983 
984     fmt++;
985 
986     /* Skip if max size is used (to be compatible with printf) */
987     while ( (*fmt >= '0' && *fmt <= '9') || *fmt == '.' || *fmt == '-')
988       fmt++;
989 
990     if (*fmt == 'l')
991       fmt++;
992 
993     if (*fmt == 's')                      /* String parameter */
994     {
995       char *par= va_arg(ap, char *);
996       size_t plen;
997       size_t left_len= (size_t)(end-dst);
998       if (!par)
999         par= (char*) "(null)";
1000       plen= strlen(par);
1001       if (left_len <= plen * 2)
1002         plen = left_len / 2 - 1;
1003 
1004       for ( ; plen ; plen--, dst+=2, par++)
1005       {
1006         dst[0]= '\0';
1007         dst[1]= par[0];
1008       }
1009       continue;
1010     }
1011     else if (*fmt == 'd' || *fmt == 'u')  /* Integer parameter */
1012     {
1013       int iarg;
1014       char nbuf[16];
1015       char *pbuf= nbuf;
1016 
1017       if ((size_t) (end - dst) < 32)
1018         break;
1019       iarg= va_arg(ap, int);
1020       if (*fmt == 'd')
1021         int10_to_str((long) iarg, nbuf, -10);
1022       else
1023         int10_to_str((long) (uint) iarg, nbuf,10);
1024 
1025       for (; pbuf[0]; pbuf++)
1026       {
1027         *dst++= '\0';
1028         *dst++= *pbuf;
1029       }
1030       continue;
1031     }
1032 
1033     /* We come here on '%%', unknown code or too long parameter */
1034     if (dst == end)
1035       break;
1036     *dst++= '\0';
1037     *dst++= '%';                            /* % used as % or unknown code */
1038   }
1039 
1040   assert(dst <= end);
1041   *dst='\0';                                /* End of errmessage */
1042   return (size_t) (dst - start);
1043 }
1044 
1045 
1046 static size_t
my_snprintf_mb2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * to,size_t n,const char * fmt,...)1047 my_snprintf_mb2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1048                 char* to, size_t n, const char* fmt, ...)
1049 {
1050   size_t retval;
1051   va_list args;
1052   va_start(args,fmt);
1053   retval= my_vsnprintf_mb2(to, n, fmt, args);
1054   va_end(args);
1055   return retval;
1056 }
1057 
1058 
1059 static size_t
my_lengthsp_mb2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)1060 my_lengthsp_mb2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1061                 const char *ptr, size_t length)
1062 {
1063   const char *end= ptr + length;
1064   while (end > ptr + 1 && end[-1] == ' ' && end[-2] == '\0')
1065     end-= 2;
1066   return (size_t) (end - ptr);
1067 }
1068 
1069 #endif /* HAVE_CHARSET_mb2*/
1070 
1071 
1072 
1073 
1074 #ifdef HAVE_CHARSET_utf16
1075 
1076 /*
  D800..DB7F - Non-private surrogate high (896 pages)
1078   DB80..DBFF - Private surrogate high     (128 pages)
1079   DC00..DFFF - Surrogate low              (1024 codes in a page)
1080 */
/* First and last code unit of the high and low surrogate ranges */
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF

/* Tests on the leading byte of a code unit (top six bits) */
#define MY_UTF16_HIGH_HEAD(x)  ((((uchar) (x)) & 0xFC) == 0xD8)
#define MY_UTF16_LOW_HEAD(x)   ((((uchar) (x)) & 0xFC) == 0xDC)
/* Test on a whole code unit: true for any value in D800..DFFF */
#define MY_UTF16_SURROGATE(x)  (((x) & 0xF800) == 0xD800)

/*
  Combine two bytes into one code unit.
  Arguments are parenthesized so that expression arguments expand safely.
*/
#define MY_UTF16_WC2(a, b)       (((a) << 8) + (b))

/*
  Combine the four bytes of a surrogate pair into a code point:

  a= 110110??  (<< 18)
  b= ????????  (<< 10)
  c= 110111??  (<<  8)
  d= ????????  (<<  0)
*/
#define MY_UTF16_WC4(a, b, c, d) ((((a) & 3) << 18) + ((b) << 10) + \
                                  (((c) & 3) << 8) + (d) + 0x10000)
1100 
1101 static int
my_utf16_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1102 my_utf16_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1103              my_wc_t *pwc, const uchar *s, const uchar *e)
1104 {
1105   if (s + 2 > e)
1106     return MY_CS_TOOSMALL2;
1107 
1108   /*
1109     High bytes: 0xD[89AB] = B'110110??'
1110     Low bytes:  0xD[CDEF] = B'110111??'
1111     Surrogate mask:  0xFC = B'11111100'
1112   */
1113 
1114   if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
1115   {
1116     if (s + 4 > e)
1117       return MY_CS_TOOSMALL4;
1118 
1119     if (!MY_UTF16_LOW_HEAD(s[2]))  /* Broken surrigate pair */
1120       return MY_CS_ILSEQ;
1121 
1122     *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
1123     return 4;
1124   }
1125 
1126   if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
1127     return MY_CS_ILSEQ;
1128 
1129   *pwc= MY_UTF16_WC2(s[0], s[1]);
1130   return 2;
1131 }
1132 
1133 
1134 static int
my_uni_utf16(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1135 my_uni_utf16(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1136              my_wc_t wc, uchar *s, uchar *e)
1137 {
1138   if (wc <= 0xFFFF)
1139   {
1140     if (s + 2 > e)
1141       return MY_CS_TOOSMALL2;
1142     if (MY_UTF16_SURROGATE(wc))
1143       return MY_CS_ILUNI;
1144     *s++= (uchar) (wc >> 8);
1145     *s= (uchar) (wc & 0xFF);
1146     return 2;
1147   }
1148 
1149   if (wc <= 0x10FFFF)
1150   {
1151     if (s + 4 > e)
1152       return MY_CS_TOOSMALL4;
1153     *s++= (uchar) ((wc-= 0x10000) >> 18) | 0xD8;
1154     *s++= (uchar) (wc >> 10) & 0xFF;
1155     *s++= (uchar) ((wc >> 8) & 3) | 0xDC;
1156     *s= (uchar) wc & 0xFF;
1157     return 4;
1158   }
1159 
1160   return MY_CS_ILUNI;
1161 }
1162 
1163 
1164 static inline void
my_tolower_utf16(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1165 my_tolower_utf16(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1166 {
1167   const MY_UNICASE_CHARACTER *page;
1168   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1169     *wc= page[*wc & 0xFF].tolower;
1170 }
1171 
1172 
1173 static inline void
my_toupper_utf16(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1174 my_toupper_utf16(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1175 {
1176   const MY_UNICASE_CHARACTER *page;
1177   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1178     *wc= page[*wc & 0xFF].toupper;
1179 }
1180 
1181 
1182 static inline void
my_tosort_utf16(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1183 my_tosort_utf16(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1184 {
1185   if (*wc <= uni_plane->maxchar)
1186   {
1187     const MY_UNICASE_CHARACTER *page;
1188     if ((page= uni_plane->page[*wc >> 8]))
1189       *wc= page[*wc & 0xFF].sort;
1190   }
1191   else
1192   {
1193     *wc= MY_CS_REPLACEMENT_CHARACTER;
1194   }
1195 }
1196 
1197 
1198 
1199 static size_t
my_caseup_utf16(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1200 my_caseup_utf16(const CHARSET_INFO *cs, char *src, size_t srclen,
1201                 char *dst MY_ATTRIBUTE((unused)),
1202                 size_t dstlen MY_ATTRIBUTE((unused)))
1203 {
1204   my_wc_t wc;
1205   int res;
1206   char *srcend= src + srclen;
1207   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1208   assert(src == dst && srclen == dstlen);
1209 
1210   while ((src < srcend) &&
1211          (res= cs->cset->mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1212   {
1213     my_toupper_utf16(uni_plane, &wc);
1214     if (res != cs->cset->wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
1215       break;
1216     src+= res;
1217   }
1218   return srclen;
1219 }
1220 
1221 
1222 static void
my_hash_sort_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * n1,ulong * n2)1223 my_hash_sort_utf16(const CHARSET_INFO *cs, const uchar *s, size_t slen,
1224                    ulong *n1, ulong *n2)
1225 {
1226   my_wc_t wc;
1227   int res;
1228   const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen);
1229   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1230   ulong tmp1;
1231   ulong tmp2;
1232 
1233   tmp1= *n1;
1234   tmp2= *n2;
1235 
1236   while ((s < e) && (res= cs->cset->mb_wc(cs, &wc,
1237                                           (uchar *) s, (uchar *) e)) > 0)
1238   {
1239     my_tosort_utf16(uni_plane, &wc);
1240     tmp1^= (((tmp1 & 63) + tmp2) * (wc & 0xFF)) + (tmp1 << 8);
1241     tmp2+= 3;
1242     tmp1^= (((tmp1 & 63) + tmp2) * (wc >> 8)) + (tmp1 << 8);
1243     tmp2+= 3;
1244     s+= res;
1245   }
1246 
1247   *n1= tmp1;
1248   *n2= tmp2;
1249 }
1250 
1251 
1252 static size_t
my_casedn_utf16(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1253 my_casedn_utf16(const CHARSET_INFO *cs, char *src, size_t srclen,
1254                 char *dst MY_ATTRIBUTE((unused)),
1255                 size_t dstlen MY_ATTRIBUTE((unused)))
1256 {
1257   my_wc_t wc;
1258   int res;
1259   char *srcend= src + srclen;
1260   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1261   assert(src == dst && srclen == dstlen);
1262 
1263   while ((src < srcend) &&
1264          (res= cs->cset->mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1265   {
1266     my_tolower_utf16(uni_plane, &wc);
1267     if (res != cs->cset->wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
1268       break;
1269     src+= res;
1270   }
1271   return srclen;
1272 }
1273 
1274 
1275 static int
my_strnncoll_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)1276 my_strnncoll_utf16(const CHARSET_INFO *cs,
1277                    const uchar *s, size_t slen,
1278                    const uchar *t, size_t tlen,
1279                    my_bool t_is_prefix)
1280 {
1281   int s_res, t_res;
1282   my_wc_t s_wc= 0, t_wc= 0;
1283   const uchar *se= s + slen;
1284   const uchar *te= t + tlen;
1285   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1286 
1287   while (s < se && t < te)
1288   {
1289     s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1290     t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1291 
1292     if (s_res <= 0 || t_res <= 0)
1293     {
1294       /* Incorrect string, compare by char value */
1295       return my_bincmp(s, se, t, te);
1296     }
1297 
1298     my_tosort_utf16(uni_plane, &s_wc);
1299     my_tosort_utf16(uni_plane, &t_wc);
1300 
1301     if (s_wc != t_wc)
1302     {
1303       return  s_wc > t_wc ? 1 : -1;
1304     }
1305 
1306     s+= s_res;
1307     t+= t_res;
1308   }
1309   return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
1310 }
1311 
1312 
/**
  Compare strings, discarding end space

  If one string is shorter than the other, then we space-extend the shorter
  one so that the strings have equal length.

  This will ensure that the following things hold:

    "a"  == "a "
    "a\0" < "a"
    "a\0" < "a "

  @param  cs    Character set pointer.
  @param  s     First string to compare.
  @param  slen  Length of 's'.
  @param  t     Second string to compare.
  @param  tlen  Length of 't'.
  @param  diff_if_only_endspace_difference
                If set, strings that differ only in trailing spaces compare
                as different. Forced to FALSE unless
                VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is defined.

  IMPLEMENTATION

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
1338 
1339 static int
my_strnncollsp_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)1340 my_strnncollsp_utf16(const CHARSET_INFO *cs,
1341                      const uchar *s, size_t slen,
1342                      const uchar *t, size_t tlen,
1343                      my_bool diff_if_only_endspace_difference)
1344 {
1345   int res;
1346   my_wc_t s_wc= 0, t_wc= 0;
1347   const uchar *se= s + slen, *te= t + tlen;
1348   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1349 
1350   assert((slen % 2) == 0);
1351   assert((tlen % 2) == 0);
1352 
1353 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
1354   diff_if_only_endspace_difference= FALSE;
1355 #endif
1356 
1357   while (s < se && t < te)
1358   {
1359     int s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1360     int t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1361 
1362     if (s_res <= 0 || t_res <= 0)
1363     {
1364       /* Incorrect string, compare bytewise */
1365       return my_bincmp(s, se, t, te);
1366     }
1367 
1368     my_tosort_utf16(uni_plane, &s_wc);
1369     my_tosort_utf16(uni_plane, &t_wc);
1370 
1371     if (s_wc != t_wc)
1372     {
1373       return s_wc > t_wc ? 1 : -1;
1374     }
1375 
1376     s+= s_res;
1377     t+= t_res;
1378   }
1379 
1380   slen= (size_t) (se - s);
1381   tlen= (size_t) (te - t);
1382   res= 0;
1383 
1384   if (slen != tlen)
1385   {
1386     int s_res, swap= 1;
1387     if (diff_if_only_endspace_difference)
1388       res= 1;                                   /* Assume 's' is bigger */
1389     if (slen < tlen)
1390     {
1391       slen= tlen;
1392       s= t;
1393       se= te;
1394       swap= -1;
1395       res= -res;
1396     }
1397 
1398     for ( ; s < se; s+= s_res)
1399     {
1400       if ((s_res= cs->cset->mb_wc(cs, &s_wc, s, se)) <= 0)
1401       {
1402         return 0;
1403       }
1404       if (s_wc != ' ')
1405         return (s_wc < ' ') ? -swap : swap;
1406     }
1407   }
1408   return res;
1409 }
1410 
1411 
1412 static uint
my_ismbchar_utf16(const CHARSET_INFO * cs,const char * b,const char * e)1413 my_ismbchar_utf16(const CHARSET_INFO *cs, const char *b, const char *e)
1414 {
1415   my_wc_t wc;
1416   int res= cs->cset->mb_wc(cs, &wc, (const uchar *) b, (const uchar *) e);
1417   return (uint) (res > 0 ? res : 0);
1418 }
1419 
1420 
1421 static uint
my_mbcharlen_utf16(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))1422 my_mbcharlen_utf16(const CHARSET_INFO *cs  MY_ATTRIBUTE((unused)),
1423                    uint c MY_ATTRIBUTE((unused)))
1424 {
1425   assert(0);
1426   return MY_UTF16_HIGH_HEAD(c) ? 4 : 2;
1427 }
1428 
1429 
1430 static size_t
my_numchars_utf16(const CHARSET_INFO * cs,const char * b,const char * e)1431 my_numchars_utf16(const CHARSET_INFO *cs,
1432                   const char *b, const char *e)
1433 {
1434   size_t nchars= 0;
1435   for ( ; ; nchars++)
1436   {
1437     size_t charlen= my_ismbchar_utf16(cs, b, e);
1438     if (!charlen)
1439       break;
1440     b+= charlen;
1441   }
1442   return nchars;
1443 }
1444 
1445 
1446 static size_t
my_charpos_utf16(const CHARSET_INFO * cs,const char * b,const char * e,size_t pos)1447 my_charpos_utf16(const CHARSET_INFO *cs,
1448                  const char *b, const char *e, size_t pos)
1449 {
1450   const char *b0= b;
1451   uint charlen;
1452 
1453   for ( ; pos; b+= charlen, pos--)
1454   {
1455     if (!(charlen= my_ismbchar(cs, b, e)))
1456       return (e + 2 - b0); /* Error, return pos outside the string */
1457   }
1458   return (size_t) (pos ? (e + 2 - b0) : (b - b0));
1459 }
1460 
1461 
1462 static size_t
my_well_formed_len_utf16(const CHARSET_INFO * cs,const char * b,const char * e,size_t nchars,int * error)1463 my_well_formed_len_utf16(const CHARSET_INFO *cs,
1464                          const char *b, const char *e,
1465                          size_t nchars, int *error)
1466 {
1467   const char *b0= b;
1468   uint charlen;
1469   *error= 0;
1470 
1471   for ( ; nchars; b+= charlen, nchars--)
1472   {
1473     if (!(charlen= my_ismbchar(cs, b, e)))
1474     {
1475       *error= b < e ? 1 : 0;
1476       break;
1477     }
1478   }
1479   return (size_t) (b - b0);
1480 }
1481 
1482 
1483 static int
my_wildcmp_utf16_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1484 my_wildcmp_utf16_ci(const CHARSET_INFO *cs,
1485                     const char *str,const char *str_end,
1486                     const char *wildstr,const char *wildend,
1487                     int escape, int w_one, int w_many)
1488 {
1489   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1490   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1491                             escape, w_one, w_many, uni_plane);
1492 }
1493 
1494 
1495 static int
my_wildcmp_utf16_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1496 my_wildcmp_utf16_bin(const CHARSET_INFO *cs,
1497                      const char *str,const char *str_end,
1498                      const char *wildstr,const char *wildend,
1499                      int escape, int w_one, int w_many)
1500 {
1501   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1502                             escape, w_one, w_many, NULL);
1503 }
1504 
1505 
1506 static int
my_strnncoll_utf16_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)1507 my_strnncoll_utf16_bin(const CHARSET_INFO *cs,
1508                        const uchar *s, size_t slen,
1509                        const uchar *t, size_t tlen,
1510                        my_bool t_is_prefix)
1511 {
1512   int s_res,t_res;
1513   my_wc_t s_wc= 0, t_wc= 0;
1514   const uchar *se=s+slen;
1515   const uchar *te=t+tlen;
1516 
1517   while ( s < se && t < te )
1518   {
1519     s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1520     t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1521 
1522     if (s_res <= 0 || t_res <= 0)
1523     {
1524       /* Incorrect string, compare by char value */
1525       return my_bincmp(s, se, t, te);
1526     }
1527     if (s_wc != t_wc)
1528     {
1529       return s_wc > t_wc ? 1 : -1;
1530     }
1531 
1532     s+= s_res;
1533     t+= t_res;
1534   }
1535   return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
1536 }
1537 
1538 
1539 static int
my_strnncollsp_utf16_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)1540 my_strnncollsp_utf16_bin(const CHARSET_INFO *cs,
1541                          const uchar *s, size_t slen,
1542                          const uchar *t, size_t tlen,
1543                          my_bool diff_if_only_endspace_difference)
1544 {
1545   int res;
1546   my_wc_t s_wc= 0, t_wc= 0;
1547   const uchar *se= s + slen, *te= t + tlen;
1548 
1549   assert((slen % 2) == 0);
1550   assert((tlen % 2) == 0);
1551 
1552 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
1553   diff_if_only_endspace_difference= FALSE;
1554 #endif
1555 
1556   while (s < se && t < te)
1557   {
1558     int s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1559     int t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1560 
1561     if (s_res <= 0 || t_res <= 0)
1562     {
1563       /* Incorrect string, compare bytewise */
1564       return my_bincmp(s, se, t, te);
1565     }
1566 
1567     if (s_wc != t_wc)
1568     {
1569       return s_wc > t_wc ? 1 : -1;
1570     }
1571 
1572     s+= s_res;
1573     t+= t_res;
1574   }
1575 
1576   slen= (size_t) (se - s);
1577   tlen= (size_t) (te - t);
1578   res= 0;
1579 
1580   if (slen != tlen)
1581   {
1582     int s_res, swap= 1;
1583     if (diff_if_only_endspace_difference)
1584       res= 1;                                   /* Assume 's' is bigger */
1585     if (slen < tlen)
1586     {
1587       slen= tlen;
1588       s= t;
1589       se= te;
1590       swap= -1;
1591       res= -res;
1592     }
1593 
1594     for ( ; s < se; s+= s_res)
1595     {
1596       if ((s_res= cs->cset->mb_wc(cs, &s_wc, s, se)) <= 0)
1597       {
1598         return 0;
1599       }
1600       if (s_wc != ' ')
1601         return (s_wc < ' ') ? -swap : swap;
1602     }
1603   }
1604   return res;
1605 }
1606 
1607 
1608 static void
my_hash_sort_utf16_bin(const CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1609 my_hash_sort_utf16_bin(const CHARSET_INFO *cs,
1610                        const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
1611 {
1612   const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len);
1613   ulong tmp1;
1614   ulong tmp2;
1615 
1616   tmp1= *nr1;
1617   tmp2= *nr2;
1618 
1619   for ( ; pos < end ; pos++)
1620   {
1621     tmp1^= (ulong) ((((uint) tmp1 & 63) + tmp2) *
1622                     ((uint)*pos)) + (tmp1 << 8);
1623     tmp2+= 3;
1624   }
1625 
1626   *nr1= tmp1;
1627   *nr2= tmp2;
1628 }
1629 
1630 
1631 static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
1632 {
1633   NULL,                /* init */
1634   my_strnncoll_utf16,
1635   my_strnncollsp_utf16,
1636   my_strnxfrm_unicode,
1637   my_strnxfrmlen_simple,
1638   my_like_range_generic,
1639   my_wildcmp_utf16_ci,
1640   my_strcasecmp_mb2_or_mb4,
1641   my_instr_mb,
1642   my_hash_sort_utf16,
1643   my_propagate_simple
1644 };
1645 
1646 
1647 static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
1648 {
1649   NULL,                /* init */
1650   my_strnncoll_utf16_bin,
1651   my_strnncollsp_utf16_bin,
1652   my_strnxfrm_unicode_full_bin,
1653   my_strnxfrmlen_unicode_full_bin,
1654   my_like_range_generic,
1655   my_wildcmp_utf16_bin,
1656   my_strcasecmp_mb2_or_mb4,
1657   my_instr_mb,
1658   my_hash_sort_utf16_bin,
1659   my_propagate_simple
1660 };
1661 
1662 
1663 MY_CHARSET_HANDLER my_charset_utf16_handler=
1664 {
1665   NULL,                /* init         */
1666   my_ismbchar_utf16,   /* ismbchar     */
1667   my_mbcharlen_utf16,  /* mbcharlen    */
1668   my_numchars_utf16,
1669   my_charpos_utf16,
1670   my_well_formed_len_utf16,
1671   my_lengthsp_mb2,
1672   my_numcells_mb,
1673   my_utf16_uni,        /* mb_wc        */
1674   my_uni_utf16,        /* wc_mb        */
1675   my_mb_ctype_mb,
1676   my_caseup_str_mb2_or_mb4,
1677   my_casedn_str_mb2_or_mb4,
1678   my_caseup_utf16,
1679   my_casedn_utf16,
1680   my_snprintf_mb2,
1681   my_l10tostr_mb2_or_mb4,
1682   my_ll10tostr_mb2_or_mb4,
1683   my_fill_mb2,
1684   my_strntol_mb2_or_mb4,
1685   my_strntoul_mb2_or_mb4,
1686   my_strntoll_mb2_or_mb4,
1687   my_strntoull_mb2_or_mb4,
1688   my_strntod_mb2_or_mb4,
1689   my_strtoll10_mb2,
1690   my_strntoull10rnd_mb2_or_mb4,
1691   my_scan_mb2
1692 };
1693 
1694 
1695 CHARSET_INFO my_charset_utf16_general_ci=
1696 {
1697   54,0,0,              /* number       */
1698   MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1699   "utf16",             /* cs name    */
1700   "utf16_general_ci",  /* name         */
1701   "UTF-16 Unicode",    /* comment      */
1702   NULL,                /* tailoring    */
1703   NULL,                /* ctype        */
1704   NULL,                /* to_lower     */
1705   NULL,                /* to_upper     */
1706   NULL,                /* sort_order   */
1707   NULL,                /* uca          */
1708   NULL,                /* tab_to_uni   */
1709   NULL,                /* tab_from_uni */
1710   &my_unicase_default, /* caseinfo     */
1711   NULL,                /* state_map    */
1712   NULL,                /* ident_map    */
1713   1,                   /* strxfrm_multiply */
1714   1,                   /* caseup_multiply  */
1715   1,                   /* casedn_multiply  */
1716   2,                   /* mbminlen     */
1717   4,                   /* mbmaxlen     */
1718   1,                   /* mbmaxlenlen  */
1719   0,                   /* min_sort_char */
1720   0xFFFF,              /* max_sort_char */
1721   ' ',                 /* pad char      */
1722   0,                   /* escape_with_backslash_is_dangerous */
1723   1,                   /* levels_for_compare */
1724   1,                   /* levels_for_order   */
1725   &my_charset_utf16_handler,
1726   &my_collation_utf16_general_ci_handler
1727 };
1728 
1729 
1730 CHARSET_INFO my_charset_utf16_bin=
1731 {
1732   55,0,0,              /* number       */
1733   MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1734   "utf16",             /* cs name      */
1735   "utf16_bin",         /* name         */
1736   "UTF-16 Unicode",    /* comment      */
1737   NULL,                /* tailoring    */
1738   NULL,                /* ctype        */
1739   NULL,                /* to_lower     */
1740   NULL,                /* to_upper     */
1741   NULL,                /* sort_order   */
1742   NULL,                /* uca          */
1743   NULL,                /* tab_to_uni   */
1744   NULL,                /* tab_from_uni */
1745   &my_unicase_default, /* caseinfo     */
1746   NULL,                /* state_map    */
1747   NULL,                /* ident_map    */
1748   1,                   /* strxfrm_multiply */
1749   1,                   /* caseup_multiply  */
1750   1,                   /* casedn_multiply  */
1751   2,                   /* mbminlen     */
1752   4,                   /* mbmaxlen     */
1753   1,                   /* mbmaxlenlen  */
1754   0,                   /* min_sort_char */
1755   0xFFFF,              /* max_sort_char */
1756   ' ',                 /* pad char      */
1757   0,                   /* escape_with_backslash_is_dangerous */
1758   1,                   /* levels_for_compare */
1759   1,                   /* levels_for_order   */
1760   &my_charset_utf16_handler,
1761   &my_collation_utf16_bin_handler
1762 };
1763 
1764 
1765 static int
my_utf16le_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1766 my_utf16le_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1767                my_wc_t *pwc, const uchar *s, const uchar *e)
1768 {
1769   my_wc_t lo;
1770 
1771   if (s + 2 > e)
1772     return MY_CS_TOOSMALL2;
1773 
1774   if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
1775       (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
1776     return 2; /* [0000-D7FF,E000-FFFF] */
1777 
1778   if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
1779     return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
1780 
1781   if (s + 4  > e)
1782     return MY_CS_TOOSMALL4;
1783 
1784   s+= 2;
1785 
1786   if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
1787       lo > MY_UTF16_SURROGATE_LOW_LAST)
1788     return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
1789 
1790   *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF));
1791   return 4;
1792 }
1793 
1794 
1795 static int
my_uni_utf16le(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1796 my_uni_utf16le(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1797                my_wc_t wc, uchar *s, uchar *e)
1798 {
1799   if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
1800       (wc > MY_UTF16_SURROGATE_LOW_LAST &&
1801        wc <= 0xFFFF))
1802   {
1803     if (s + 2 > e)
1804       return MY_CS_TOOSMALL2;
1805     int2store(s, (uint16)wc);
1806     return 2; /* [0000-D7FF,E000-FFFF] */
1807   }
1808 
1809   if (wc < 0xFFFF || wc > 0x10FFFF)
1810     return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */
1811 
1812   if (s + 4 > e)
1813     return MY_CS_TOOSMALL4;
1814 
1815   wc-= 0x10000;
1816   int2store(s,     (0xD800 | ((wc >> 10) & 0x3FF))); s+= 2;
1817   int2store(s,     (0xDC00 | (wc & 0x3FF)));
1818   return 4; /* [010000-10FFFF] */
1819 }
1820 
1821 
1822 static size_t
my_lengthsp_utf16le(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)1823 my_lengthsp_utf16le(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1824                     const char *ptr, size_t length)
1825 {
1826   const char *end= ptr + length;
1827   while (end > ptr + 1 && uint2korr((uchar*) end - 2) == 0x20)
1828     end-= 2;
1829   return (size_t) (end - ptr);
1830 }
1831 
1832 
1833 static MY_CHARSET_HANDLER my_charset_utf16le_handler=
1834 {
1835   NULL,                /* init         */
1836   my_ismbchar_utf16,
1837   my_mbcharlen_utf16,
1838   my_numchars_utf16,
1839   my_charpos_utf16,
1840   my_well_formed_len_utf16,
1841   my_lengthsp_utf16le,
1842   my_numcells_mb,
1843   my_utf16le_uni,      /* mb_wc        */
1844   my_uni_utf16le,      /* wc_mb        */
1845   my_mb_ctype_mb,
1846   my_caseup_str_mb2_or_mb4,
1847   my_casedn_str_mb2_or_mb4,
1848   my_caseup_utf16,
1849   my_casedn_utf16,
1850   my_snprintf_mb2,
1851   my_l10tostr_mb2_or_mb4,
1852   my_ll10tostr_mb2_or_mb4,
1853   my_fill_mb2,
1854   my_strntol_mb2_or_mb4,
1855   my_strntoul_mb2_or_mb4,
1856   my_strntoll_mb2_or_mb4,
1857   my_strntoull_mb2_or_mb4,
1858   my_strntod_mb2_or_mb4,
1859   my_strtoll10_mb2,
1860   my_strntoull10rnd_mb2_or_mb4,
1861   my_scan_mb2
1862 };
1863 
1864 
1865 CHARSET_INFO my_charset_utf16le_general_ci=
1866 {
1867   56,0,0,              /* number       */
1868   MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1869   "utf16le",           /* cs name    */
1870   "utf16le_general_ci",/* name         */
1871   "UTF-16LE Unicode",  /* comment      */
1872   NULL,                /* tailoring    */
1873   NULL,                /* ctype        */
1874   NULL,                /* to_lower     */
1875   NULL,                /* to_upper     */
1876   NULL,                /* sort_order   */
1877   NULL,                /* uca          */
1878   NULL,                /* tab_to_uni   */
1879   NULL,                /* tab_from_uni */
1880   &my_unicase_default, /* caseinfo     */
1881   NULL,                /* state_map    */
1882   NULL,                /* ident_map    */
1883   1,                   /* strxfrm_multiply */
1884   1,                   /* caseup_multiply  */
1885   1,                   /* casedn_multiply  */
1886   2,                   /* mbminlen     */
1887   4,                   /* mbmaxlen     */
1888   1,                   /* mbmaxlenlen  */
1889   0,                   /* min_sort_char */
1890   0xFFFF,              /* max_sort_char */
1891   ' ',                 /* pad char      */
1892   0,                   /* escape_with_backslash_is_dangerous */
1893   1,                   /* levels_for_compare */
1894   1,                   /* levels_for_order   */
1895   &my_charset_utf16le_handler,
1896   &my_collation_utf16_general_ci_handler
1897 };
1898 
1899 
1900 CHARSET_INFO my_charset_utf16le_bin=
1901 {
1902   62,0,0,              /* number       */
1903   MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1904   "utf16le",           /* cs name      */
1905   "utf16le_bin",       /* name         */
1906   "UTF-16LE Unicode",  /* comment      */
1907   NULL,                /* tailoring    */
1908   NULL,                /* ctype        */
1909   NULL,                /* to_lower     */
1910   NULL,                /* to_upper     */
1911   NULL,                /* sort_order   */
1912   NULL,                /* uca          */
1913   NULL,                /* tab_to_uni   */
1914   NULL,                /* tab_from_uni */
1915   &my_unicase_default, /* caseinfo     */
1916   NULL,                /* state_map    */
1917   NULL,                /* ident_map    */
1918   1,                   /* strxfrm_multiply */
1919   1,                   /* caseup_multiply  */
1920   1,                   /* casedn_multiply  */
1921   2,                   /* mbminlen     */
1922   4,                   /* mbmaxlen     */
1923   1,                   /* mbmaxlenlen  */
1924   0,                   /* min_sort_char */
1925   0xFFFF,              /* max_sort_char */
1926   ' ',                 /* pad char      */
1927   0,                   /* escape_with_backslash_is_dangerous */
1928   1,                   /* levels_for_compare */
1929   1,                   /* levels_for_order   */
1930   &my_charset_utf16le_handler,
1931   &my_collation_utf16_bin_handler
1932 };
1933 
1934 
1935 #endif /* HAVE_CHARSET_utf16 */
1936 
1937 
1938 #ifdef HAVE_CHARSET_utf32
1939 
1940 static int
my_utf32_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1941 my_utf32_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1942              my_wc_t *pwc, const uchar *s, const uchar *e)
1943 {
1944   if (s + 4 > e)
1945     return MY_CS_TOOSMALL4;
1946   *pwc= (((my_wc_t)s[0]) << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
1947   return 4;
1948 }
1949 
1950 
1951 static int
my_uni_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1952 my_uni_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1953              my_wc_t wc, uchar *s, uchar *e)
1954 {
1955   if (s + 4 > e)
1956     return MY_CS_TOOSMALL4;
1957 
1958   s[0]= (uchar) (wc >> 24);
1959   s[1]= (uchar) (wc >> 16) & 0xFF;
1960   s[2]= (uchar) (wc >> 8)  & 0xFF;
1961   s[3]= (uchar) wc & 0xFF;
1962   return 4;
1963 }
1964 
1965 
1966 static inline void
my_tolower_utf32(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1967 my_tolower_utf32(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1968 {
1969   const MY_UNICASE_CHARACTER *page;
1970   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1971     *wc= page[*wc & 0xFF].tolower;
1972 }
1973 
1974 
1975 static inline void
my_toupper_utf32(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1976 my_toupper_utf32(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1977 {
1978   const MY_UNICASE_CHARACTER *page;
1979   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1980     *wc= page[*wc & 0xFF].toupper;
1981 }
1982 
1983 
1984 static inline void
my_tosort_utf32(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1985 my_tosort_utf32(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1986 {
1987   if (*wc <= uni_plane->maxchar)
1988   {
1989     const MY_UNICASE_CHARACTER *page;
1990     if ((page= uni_plane->page[*wc >> 8]))
1991       *wc= page[*wc & 0xFF].sort;
1992   }
1993   else
1994   {
1995     *wc= MY_CS_REPLACEMENT_CHARACTER;
1996   }
1997 }
1998 
1999 
2000 static size_t
my_caseup_utf32(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))2001 my_caseup_utf32(const CHARSET_INFO *cs, char *src, size_t srclen,
2002                 char *dst MY_ATTRIBUTE((unused)),
2003                 size_t dstlen MY_ATTRIBUTE((unused)))
2004 {
2005   my_wc_t wc;
2006   int res;
2007   char *srcend= src + srclen;
2008   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2009   assert(src == dst && srclen == dstlen);
2010 
2011   while ((src < srcend) &&
2012          (res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
2013   {
2014     my_toupper_utf32(uni_plane, &wc);
2015     if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
2016       break;
2017     src+= res;
2018   }
2019   return srclen;
2020 }
2021 
2022 
2023 static void
my_hash_sort_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * n1,ulong * n2)2024 my_hash_sort_utf32(const CHARSET_INFO *cs, const uchar *s, size_t slen,
2025                    ulong *n1, ulong *n2)
2026 {
2027   my_wc_t wc;
2028   int res;
2029   const uchar *e= s + slen;
2030   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2031   ulong tmp1;
2032   ulong tmp2;
2033   uint ch;
2034 
2035   /* Skip trailing spaces */
2036   while (e > s + 3 && e[-1] == ' ' && !e[-2] && !e[-3] && !e[-4])
2037     e-= 4;
2038 
2039   tmp1= *n1;
2040   tmp2= *n2;
2041 
2042   while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
2043   {
2044     my_tosort_utf32(uni_plane, &wc);
2045 
2046     ch= (wc >> 24);
2047     tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2048     tmp2+= 3;
2049 
2050     ch= (wc >> 16) & 0xFF;
2051     tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2052     tmp2+= 3;
2053 
2054     ch= (wc >> 8)  & 0xFF;
2055     tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2056     tmp2+= 3;
2057 
2058     ch= (wc & 0xFF);
2059     tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2060     tmp2+= 3;
2061 
2062     s+= res;
2063   }
2064 
2065   *n1= tmp1;
2066   *n2= tmp2;
2067 }
2068 
2069 
2070 static size_t
my_casedn_utf32(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))2071 my_casedn_utf32(const CHARSET_INFO *cs, char *src, size_t srclen,
2072                 char *dst MY_ATTRIBUTE((unused)),
2073                 size_t dstlen MY_ATTRIBUTE((unused)))
2074 {
2075   my_wc_t wc;
2076   int res;
2077   char *srcend= src + srclen;
2078   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2079   assert(src == dst && srclen == dstlen);
2080 
2081   while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
2082   {
2083     my_tolower_utf32(uni_plane,&wc);
2084     if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
2085       break;
2086     src+= res;
2087   }
2088   return srclen;
2089 }
2090 
2091 
2092 static int
my_strnncoll_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)2093 my_strnncoll_utf32(const CHARSET_INFO *cs,
2094                    const uchar *s, size_t slen,
2095                    const uchar *t, size_t tlen,
2096                    my_bool t_is_prefix)
2097 {
2098   my_wc_t s_wc= 0, t_wc= 0;
2099   const uchar *se= s + slen;
2100   const uchar *te= t + tlen;
2101   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2102 
2103   while (s < se && t < te)
2104   {
2105     int s_res= my_utf32_uni(cs, &s_wc, s, se);
2106     int t_res= my_utf32_uni(cs, &t_wc, t, te);
2107 
2108     if ( s_res <= 0 || t_res <= 0)
2109     {
2110       /* Incorrect string, compare by char value */
2111       return my_bincmp(s, se, t, te);
2112     }
2113 
2114     my_tosort_utf32(uni_plane, &s_wc);
2115     my_tosort_utf32(uni_plane, &t_wc);
2116 
2117     if (s_wc != t_wc)
2118     {
2119       return s_wc > t_wc ? 1 : -1;
2120     }
2121 
2122     s+= s_res;
2123     t+= t_res;
2124   }
2125   return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
2126 }
2127 
2128 
/**
  Compare strings, discarding end space

  If one string is shorter than the other, then we space-extend the shorter
  one so that the strings have equal length.

  This will ensure that the following things hold:

    "a"  == "a "
    "a\0" < "a"
    "a\0" < "a "

  @param  cs    Character set pointer.
  @param  s     First string to compare.
  @param  slen  Length of 's'.
  @param  t     Second string to compare.
  @param  tlen  Length of 't'.
  @param  diff_if_only_endspace_difference
                If set, strings that differ only in trailing spaces compare
                as different. Forced to FALSE unless
                VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is defined.

  IMPLEMENTATION

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
2154 
2155 
2156 static int
my_strnncollsp_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)2157 my_strnncollsp_utf32(const CHARSET_INFO *cs,
2158                      const uchar *s, size_t slen,
2159                      const uchar *t, size_t tlen,
2160                      my_bool diff_if_only_endspace_difference)
2161 {
2162   int res;
2163   my_wc_t s_wc= 0, t_wc= 0;
2164   const uchar *se= s + slen, *te= t + tlen;
2165   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2166 
2167   assert((slen % 4) == 0);
2168   assert((tlen % 4) == 0);
2169 
2170 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
2171   diff_if_only_endspace_difference= FALSE;
2172 #endif
2173 
2174   while ( s < se && t < te )
2175   {
2176     int s_res= my_utf32_uni(cs, &s_wc, s, se);
2177     int t_res= my_utf32_uni(cs, &t_wc, t, te);
2178 
2179     if ( s_res <= 0 || t_res <= 0 )
2180     {
2181       /* Incorrect string, compare bytewise */
2182       return my_bincmp(s, se, t, te);
2183     }
2184 
2185     my_tosort_utf32(uni_plane, &s_wc);
2186     my_tosort_utf32(uni_plane, &t_wc);
2187 
2188     if ( s_wc != t_wc )
2189     {
2190       return s_wc > t_wc ? 1 : -1;
2191     }
2192 
2193     s+= s_res;
2194     t+= t_res;
2195   }
2196 
2197   slen= (size_t) (se - s);
2198   tlen= (size_t) (te - t);
2199   res= 0;
2200 
2201   if (slen != tlen)
2202   {
2203     int s_res, swap= 1;
2204     if (diff_if_only_endspace_difference)
2205       res= 1;                                   /* Assume 's' is bigger */
2206     if (slen < tlen)
2207     {
2208       slen= tlen;
2209       s= t;
2210       se= te;
2211       swap= -1;
2212       res= -res;
2213     }
2214 
2215     for ( ; s < se; s+= s_res)
2216     {
2217       if ((s_res= my_utf32_uni(cs, &s_wc, s, se)) < 0)
2218       {
2219         assert(0);
2220         return 0;
2221       }
2222       if (s_wc != ' ')
2223         return (s_wc < ' ') ? -swap : swap;
2224     }
2225   }
2226   return res;
2227 }
2228 
2229 
2230 static size_t
my_strnxfrmlen_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),size_t len)2231 my_strnxfrmlen_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2232                      size_t len)
2233 {
2234   return len / 2;
2235 }
2236 
2237 
2238 static uint
my_ismbchar_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)))2239 my_ismbchar_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2240                   const char *b MY_ATTRIBUTE((unused)),
2241                   const char *e MY_ATTRIBUTE((unused)))
2242 {
2243   return 4;
2244 }
2245 
2246 
2247 static uint
my_mbcharlen_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))2248 my_mbcharlen_utf32(const CHARSET_INFO *cs  MY_ATTRIBUTE((unused)) ,
2249                    uint c MY_ATTRIBUTE((unused)))
2250 {
2251   return 4;
2252 }
2253 
2254 
2255 static size_t
my_vsnprintf_utf32(char * dst,size_t n,const char * fmt,va_list ap)2256 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
2257 {
2258   char *start= dst, *end= dst + n;
2259   assert((n % 4) == 0);
2260   for (; *fmt ; fmt++)
2261   {
2262     if (fmt[0] != '%')
2263     {
2264       if (dst >= end)                        /* End of buffer */
2265         break;
2266 
2267       *dst++= '\0';
2268       *dst++= '\0';
2269       *dst++= '\0';
2270       *dst++= *fmt;        /* Copy ordinary char */
2271       continue;
2272     }
2273 
2274     fmt++;
2275 
2276     /* Skip if max size is used (to be compatible with printf) */
2277     while ( (*fmt>='0' && *fmt<='9') || *fmt == '.' || *fmt == '-')
2278       fmt++;
2279 
2280     if (*fmt == 'l')
2281       fmt++;
2282 
2283     if (*fmt == 's')                                /* String parameter */
2284     {
2285       char *par= va_arg(ap, char *);
2286       size_t plen;
2287       size_t left_len= (size_t)(end - dst);
2288       if (!par) par= (char*)"(null)";
2289       plen= strlen(par);
2290       if (left_len <= plen*4)
2291         plen= left_len / 4 - 1;
2292 
2293       for ( ; plen ; plen--, dst+= 4, par++)
2294       {
2295         dst[0]= '\0';
2296         dst[1]= '\0';
2297         dst[2]= '\0';
2298         dst[3]= par[0];
2299       }
2300       continue;
2301     }
2302     else if (*fmt == 'd' || *fmt == 'u')        /* Integer parameter */
2303     {
2304       int iarg;
2305       char nbuf[16];
2306       char *pbuf= nbuf;
2307 
2308       if ((size_t) (end - dst) < 64)
2309         break;
2310       iarg= va_arg(ap, int);
2311       if (*fmt == 'd')
2312         int10_to_str((long) iarg, nbuf, -10);
2313       else
2314         int10_to_str((long) (uint) iarg,nbuf,10);
2315 
2316       for (; pbuf[0]; pbuf++)
2317       {
2318         *dst++= '\0';
2319         *dst++= '\0';
2320         *dst++= '\0';
2321         *dst++= *pbuf;
2322       }
2323       continue;
2324     }
2325 
2326     /* We come here on '%%', unknown code or too long parameter */
2327     if (dst == end)
2328       break;
2329     *dst++= '\0';
2330     *dst++= '\0';
2331     *dst++= '\0';
2332     *dst++= '%';    /* % used as % or unknown code */
2333   }
2334 
2335   assert(dst < end);
2336   *dst++= '\0';
2337   *dst++= '\0';
2338   *dst++= '\0';
2339   *dst++= '\0';     /* End of errmessage */
2340   return (size_t) (dst - start - 4);
2341 }
2342 
2343 
2344 static size_t
my_snprintf_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * to,size_t n,const char * fmt,...)2345 my_snprintf_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2346                   char* to, size_t n, const char* fmt, ...)
2347 {
2348   size_t retval;
2349   va_list args;
2350   va_start(args,fmt);
2351   retval= my_vsnprintf_utf32(to, n, fmt, args);
2352   va_end(args);
2353   return retval;
2354 }
2355 
2356 
2357 static longlong
my_strtoll10_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * nptr,char ** endptr,int * error)2358 my_strtoll10_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2359                    const char *nptr, char **endptr, int *error)
2360 {
2361   const char *s, *end, *start, *n_end, *true_end;
2362   uchar c;
2363   unsigned long i, j, k;
2364   ulonglong li;
2365   int negative;
2366   ulong cutoff, cutoff2, cutoff3;
2367 
2368   s= nptr;
2369   /* If fixed length string */
2370   if (endptr)
2371   {
2372     /* Make sure string length is even */
2373     end= s + ((*endptr - s) / 4) * 4;
2374     while (s < end && !s[0] && !s[1] && !s[2] &&
2375            (s[3] == ' ' || s[3] == '\t'))
2376       s+= 4;
2377     if (s == end)
2378       goto no_conv;
2379   }
2380   else
2381   {
2382      /* We don't support null terminated strings in UCS2 */
2383      goto no_conv;
2384   }
2385 
2386   /* Check for a sign. */
2387   negative= 0;
2388   if (!s[0] && !s[1] && !s[2] && s[3] == '-')
2389   {
2390     *error= -1;                                        /* Mark as negative number */
2391     negative= 1;
2392     s+= 4;
2393     if (s == end)
2394       goto no_conv;
2395     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
2396     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
2397     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
2398   }
2399   else
2400   {
2401     *error= 0;
2402     if (!s[0] && !s[1] && !s[2] && s[3] == '+')
2403     {
2404       s+= 4;
2405       if (s == end)
2406         goto no_conv;
2407     }
2408     cutoff=  ULONGLONG_MAX / LFACTOR2;
2409     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
2410     cutoff3=  ULONGLONG_MAX % 100;
2411   }
2412 
2413   /* Handle case where we have a lot of pre-zero */
2414   if (!s[0] && !s[1] && !s[2] && s[3] == '0')
2415   {
2416     i= 0;
2417     do
2418     {
2419       s+= 4;
2420       if (s == end)
2421         goto end_i;                                /* Return 0 */
2422     }
2423     while (!s[0] && !s[1] && !s[2] && s[3] == '0');
2424     n_end= s + 4 * INIT_CNT;
2425   }
2426   else
2427   {
2428     /* Read first digit to check that it's a valid number */
2429     if (s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2430       goto no_conv;
2431     i= c;
2432     s+= 4;
2433     n_end= s + 4 * (INIT_CNT-1);
2434   }
2435 
2436   /* Handle first 9 digits and store them in i */
2437   if (n_end > end)
2438     n_end= end;
2439   for (; s != n_end ; s+= 4)
2440   {
2441     if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2442       goto end_i;
2443     i= i * 10 + c;
2444   }
2445   if (s == end)
2446     goto end_i;
2447 
2448   /* Handle next 9 digits and store them in j */
2449   j= 0;
2450   start= s;                                /* Used to know how much to shift i */
2451   n_end= true_end= s + 4 * INIT_CNT;
2452   if (n_end > end)
2453     n_end= end;
2454   do
2455   {
2456     if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2457       goto end_i_and_j;
2458     j= j * 10 + c;
2459     s+= 4;
2460   } while (s != n_end);
2461   if (s == end)
2462   {
2463     if (s != true_end)
2464       goto end_i_and_j;
2465     goto end3;
2466   }
2467   if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2468     goto end3;
2469 
2470   /* Handle the next 1 or 2 digits and store them in k */
2471   k=c;
2472   s+= 4;
2473   if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2474     goto end4;
2475   k= k * 10 + c;
2476   s+= 2;
2477   *endptr= (char*) s;
2478 
2479   /* number string should have ended here */
2480   if (s != end && !s[0] && !s[1] && !s[2] && (c= (s[3] - '0')) <= 9)
2481     goto overflow;
2482 
2483   /* Check that we didn't get an overflow with the last digit */
2484   if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
2485                                      k > cutoff3)))
2486     goto overflow;
2487   li= i * LFACTOR2+ (ulonglong) j * 100 + k;
2488   return (longlong) li;
2489 
2490 overflow:                                        /* *endptr is set here */
2491   *error= MY_ERRNO_ERANGE;
2492   return negative ? LLONG_MIN : (longlong) ULONGLONG_MAX;
2493 
2494 end_i:
2495   *endptr= (char*) s;
2496   return (negative ? ((longlong) -(long) i) : (longlong) i);
2497 
2498 end_i_and_j:
2499   li= (ulonglong) i * lfactor[(size_t) (s-start) / 4] + j;
2500   *endptr= (char*) s;
2501   return (negative ? -((longlong) li) : (longlong) li);
2502 
2503 end3:
2504   li= (ulonglong) i*LFACTOR+ (ulonglong) j;
2505   *endptr= (char*) s;
2506   return (negative ? -((longlong) li) : (longlong) li);
2507 
2508 end4:
2509   li= (ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
2510   *endptr= (char*) s;
2511   if (negative)
2512   {
2513    if (li > MAX_NEGATIVE_NUMBER)
2514      goto overflow;
2515    return -((longlong) li);
2516   }
2517   return (longlong) li;
2518 
2519 no_conv:
2520   /* There was no number to convert.  */
2521   *error= MY_ERRNO_EDOM;
2522   *endptr= (char *) nptr;
2523   return 0;
2524 }
2525 
2526 
2527 static size_t
my_numchars_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e)2528 my_numchars_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2529                   const char *b, const char *e)
2530 {
2531   return (size_t) (e - b) / 4;
2532 }
2533 
2534 
2535 static size_t
my_charpos_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t pos)2536 my_charpos_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2537                  const char *b, const char *e, size_t pos)
2538 {
2539   size_t string_length= (size_t) (e - b);
2540   return pos * 4 > string_length ? string_length + 4 : pos * 4;
2541 }
2542 
2543 
2544 static size_t
my_well_formed_len_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t nchars,int * error)2545 my_well_formed_len_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2546                          const char *b, const char *e,
2547                          size_t nchars, int *error)
2548 {
2549   /* Ensure string length is divisible by 4 */
2550   const char *b0= b;
2551   size_t length= e - b;
2552   assert((length % 4) == 0);
2553   *error= 0;
2554   nchars*= 4;
2555   if (length > nchars)
2556   {
2557     length= nchars;
2558     e= b + nchars;
2559   }
2560   for (; b < e; b+= 4)
2561   {
2562     /* Don't accept characters greater than U+10FFFF */
2563     if (b[0] || (uchar) b[1] > 0x10)
2564     {
2565       *error= 1;
2566       return b - b0;
2567     }
2568   }
2569   return length;
2570 }
2571 
2572 
2573 static
my_fill_utf32(const CHARSET_INFO * cs,char * s,size_t slen,int fill)2574 void my_fill_utf32(const CHARSET_INFO *cs,
2575                    char *s, size_t slen, int fill)
2576 {
2577   char buf[10];
2578   char *e= s + slen;
2579 
2580   assert((slen % 4) == 0);
2581   {
2582 #ifndef NDEBUG
2583     uint buflen=
2584 #endif
2585       cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
2586                       (uchar*) buf + sizeof(buf));
2587     assert(buflen == 4);
2588   }
2589   while (s < e)
2590   {
2591     memcpy(s, buf, 4);
2592     s+= 4;
2593   }
2594 }
2595 
2596 
2597 static size_t
my_lengthsp_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)2598 my_lengthsp_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2599                   const char *ptr, size_t length)
2600 {
2601   const char *end= ptr + length;
2602   assert((length % 4) == 0);
2603   while (end > ptr + 3 && end[-1] == ' ' && !end[-2] && !end[-3] && !end[-4])
2604     end-= 4;
2605   return (size_t) (end - ptr);
2606 }
2607 
2608 
2609 static int
my_wildcmp_utf32_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2610 my_wildcmp_utf32_ci(const CHARSET_INFO *cs,
2611                     const char *str, const char *str_end,
2612                     const char *wildstr, const char *wildend,
2613                     int escape, int w_one, int w_many)
2614 {
2615   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2616   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2617                             escape, w_one, w_many, uni_plane);
2618 }
2619 
2620 
2621 static int
my_wildcmp_utf32_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2622 my_wildcmp_utf32_bin(const CHARSET_INFO *cs,
2623                      const char *str,const char *str_end,
2624                      const char *wildstr,const char *wildend,
2625                      int escape, int w_one, int w_many)
2626 {
2627   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2628                             escape, w_one, w_many, NULL);
2629 }
2630 
2631 
2632 static int
my_strnncoll_utf32_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)2633 my_strnncoll_utf32_bin(const CHARSET_INFO *cs,
2634                        const uchar *s, size_t slen,
2635                        const uchar *t, size_t tlen,
2636                        my_bool t_is_prefix)
2637 {
2638   my_wc_t s_wc= 0, t_wc= 0;
2639   const uchar *se= s + slen;
2640   const uchar *te= t + tlen;
2641 
2642   while (s < se && t < te)
2643   {
2644     int s_res= my_utf32_uni(cs, &s_wc, s, se);
2645     int t_res= my_utf32_uni(cs, &t_wc, t, te);
2646 
2647     if (s_res <= 0 || t_res <= 0)
2648     {
2649       /* Incorrect string, compare by char value */
2650       return my_bincmp(s, se, t, te);
2651     }
2652     if (s_wc != t_wc)
2653     {
2654       return  s_wc > t_wc ? 1 : -1;
2655     }
2656 
2657     s+= s_res;
2658     t+= t_res;
2659   }
2660   return (int) (t_is_prefix ? (t-te) : ((se - s) - (te - t)));
2661 }
2662 
2663 
2664 static inline my_wc_t
my_utf32_get(const uchar * s)2665 my_utf32_get(const uchar *s)
2666 {
2667   return
2668     ((my_wc_t) s[0] << 24) +
2669     ((my_wc_t) s[1] << 16) +
2670     ((my_wc_t) s[2] << 8) +
2671     s[3];
2672 }
2673 
2674 
2675 static int
my_strnncollsp_utf32_bin(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference MY_ATTRIBUTE ((unused)))2676 my_strnncollsp_utf32_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2677                          const uchar *s, size_t slen,
2678                          const uchar *t, size_t tlen,
2679                          my_bool diff_if_only_endspace_difference
2680                          MY_ATTRIBUTE((unused)))
2681 {
2682   const uchar *se, *te;
2683   size_t minlen;
2684 
2685   assert((slen % 4) == 0);
2686   assert((tlen % 4) == 0);
2687 
2688   se= s + slen;
2689   te= t + tlen;
2690 
2691   for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 4)
2692   {
2693     my_wc_t s_wc= my_utf32_get(s);
2694     my_wc_t t_wc= my_utf32_get(t);
2695     if (s_wc != t_wc)
2696       return  s_wc > t_wc ? 1 : -1;
2697 
2698     s+= 4;
2699     t+= 4;
2700   }
2701 
2702   if (slen != tlen)
2703   {
2704     int swap= 1;
2705     if (slen < tlen)
2706     {
2707       s= t;
2708       se= te;
2709       swap= -1;
2710     }
2711 
2712     for ( ; s < se ; s+= 4)
2713     {
2714       my_wc_t s_wc= my_utf32_get(s);
2715       if (s_wc != ' ')
2716         return (s_wc < ' ') ? -swap : swap;
2717     }
2718   }
2719   return 0;
2720 }
2721 
2722 
2723 static size_t
my_scan_utf32(const CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)2724 my_scan_utf32(const CHARSET_INFO *cs,
2725               const char *str, const char *end, int sequence_type)
2726 {
2727   const char *str0= str;
2728 
2729   switch (sequence_type)
2730   {
2731   case MY_SEQ_SPACES:
2732     for ( ; str < end; )
2733     {
2734       my_wc_t wc;
2735       int res= my_utf32_uni(cs, &wc, (uchar*) str, (uchar*) end);
2736       if (res < 0 || wc != ' ')
2737         break;
2738       str+= res;
2739     }
2740     return (size_t) (str - str0);
2741   default:
2742     return 0;
2743   }
2744 }
2745 
2746 
2747 static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
2748 {
2749   NULL, /* init */
2750   my_strnncoll_utf32,
2751   my_strnncollsp_utf32,
2752   my_strnxfrm_unicode,
2753   my_strnxfrmlen_utf32,
2754   my_like_range_generic,
2755   my_wildcmp_utf32_ci,
2756   my_strcasecmp_mb2_or_mb4,
2757   my_instr_mb,
2758   my_hash_sort_utf32,
2759   my_propagate_simple
2760 };
2761 
2762 
2763 static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
2764 {
2765   NULL, /* init */
2766   my_strnncoll_utf32_bin,
2767   my_strnncollsp_utf32_bin,
2768   my_strnxfrm_unicode_full_bin,
2769   my_strnxfrmlen_unicode_full_bin,
2770   my_like_range_generic,
2771   my_wildcmp_utf32_bin,
2772   my_strcasecmp_mb2_or_mb4,
2773   my_instr_mb,
2774   my_hash_sort_utf32,
2775   my_propagate_simple
2776 };
2777 
2778 
2779 MY_CHARSET_HANDLER my_charset_utf32_handler=
2780 {
2781   NULL, /* init */
2782   my_ismbchar_utf32,
2783   my_mbcharlen_utf32,
2784   my_numchars_utf32,
2785   my_charpos_utf32,
2786   my_well_formed_len_utf32,
2787   my_lengthsp_utf32,
2788   my_numcells_mb,
2789   my_utf32_uni,
2790   my_uni_utf32,
2791   my_mb_ctype_mb,
2792   my_caseup_str_mb2_or_mb4,
2793   my_casedn_str_mb2_or_mb4,
2794   my_caseup_utf32,
2795   my_casedn_utf32,
2796   my_snprintf_utf32,
2797   my_l10tostr_mb2_or_mb4,
2798   my_ll10tostr_mb2_or_mb4,
2799   my_fill_utf32,
2800   my_strntol_mb2_or_mb4,
2801   my_strntoul_mb2_or_mb4,
2802   my_strntoll_mb2_or_mb4,
2803   my_strntoull_mb2_or_mb4,
2804   my_strntod_mb2_or_mb4,
2805   my_strtoll10_utf32,
2806   my_strntoull10rnd_mb2_or_mb4,
2807   my_scan_utf32
2808 };
2809 
2810 
2811 CHARSET_INFO my_charset_utf32_general_ci=
2812 {
2813   60,0,0,              /* number       */
2814   MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT|MY_CS_NONASCII,
2815   "utf32",             /* cs name    */
2816   "utf32_general_ci",  /* name         */
2817   "UTF-32 Unicode",    /* comment      */
2818   NULL,                /* tailoring    */
2819   NULL,                /* ctype        */
2820   NULL,                /* to_lower     */
2821   NULL,                /* to_upper     */
2822   NULL,                /* sort_order   */
2823   NULL,                /* uca          */
2824   NULL,                /* tab_to_uni   */
2825   NULL,                /* tab_from_uni */
2826   &my_unicase_default, /* caseinfo     */
2827   NULL,                /* state_map    */
2828   NULL,                /* ident_map    */
2829   1,                   /* strxfrm_multiply */
2830   1,                   /* caseup_multiply  */
2831   1,                   /* casedn_multiply  */
2832   4,                   /* mbminlen     */
2833   4,                   /* mbmaxlen     */
2834   1,                   /* mbmaxlenlen  */
2835   0,                   /* min_sort_char */
2836   0xFFFF,              /* max_sort_char */
2837   ' ',                 /* pad char      */
2838   0,                   /* escape_with_backslash_is_dangerous */
2839   1,                   /* levels_for_compare */
2840   1,                   /* levels_for_order   */
2841   &my_charset_utf32_handler,
2842   &my_collation_utf32_general_ci_handler
2843 };
2844 
2845 
2846 CHARSET_INFO my_charset_utf32_bin=
2847 {
2848   61,0,0,              /* number       */
2849   MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
2850   "utf32",             /* cs name    */
2851   "utf32_bin",         /* name         */
2852   "UTF-32 Unicode",    /* comment      */
2853   NULL,                /* tailoring    */
2854   NULL,                /* ctype        */
2855   NULL,                /* to_lower     */
2856   NULL,                /* to_upper     */
2857   NULL,                /* sort_order   */
2858   NULL,                /* uca          */
2859   NULL,                /* tab_to_uni   */
2860   NULL,                /* tab_from_uni */
2861   &my_unicase_default, /* caseinfo     */
2862   NULL,                /* state_map    */
2863   NULL,                /* ident_map    */
2864   1,                   /* strxfrm_multiply */
2865   1,                   /* caseup_multiply  */
2866   1,                   /* casedn_multiply  */
2867   4,                   /* mbminlen     */
2868   4,                   /* mbmaxlen     */
2869   1,                   /* mbmaxlenlen  */
2870   0,                   /* min_sort_char */
2871   0xFFFF,              /* max_sort_char */
2872   ' ',                 /* pad char      */
2873   0,                   /* escape_with_backslash_is_dangerous */
2874   1,                   /* levels_for_compare */
2875   1,                   /* levels_for_order   */
2876   &my_charset_utf32_handler,
2877   &my_collation_utf32_bin_handler
2878 };
2879 
2880 
2881 #endif /* HAVE_CHARSET_utf32 */
2882 
2883 
2884 #ifdef HAVE_CHARSET_ucs2
2885 
2886 static const uchar ctype_ucs2[] = {
2887     0,
2888    32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
2889    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2890    72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
2891   132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
2892    16,129,129,129,129,129,129,  1,  1,  1,  1,  1,  1,  1,  1,  1,
2893     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 16, 16, 16, 16, 16,
2894    16,130,130,130,130,130,130,  2,  2,  2,  2,  2,  2,  2,  2,  2,
2895     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 16, 16, 16, 16, 32,
2896     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2897     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2898     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2899     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2900     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2901     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2902     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
2903     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
2904 };
2905 
2906 static const uchar to_lower_ucs2[] = {
2907     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
2908    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2909    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2910    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
2911    64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
2912   112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
2913    96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
2914   112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2915   128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2916   144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
2917   160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2918   176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2919   192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2920   208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2921   224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
2922   240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
2923 };
2924 
2925 static const uchar to_upper_ucs2[] = {
2926     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
2927    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2928    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2929    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
2930    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
2931    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
2932    96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
2933    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
2934   128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2935   144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
2936   160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2937   176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2938   192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2939   208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2940   224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
2941   240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
2942 };
2943 
2944 
/*
  Decode one UCS-2 character from [s, e) into *pwc, high byte first.

  Returns 2 (the number of bytes consumed), or MY_CS_TOOSMALL2 when fewer
  than 2 bytes are available; *pwc is left untouched in that case.
*/
static int my_ucs2_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
                       my_wc_t * pwc, const uchar *s, const uchar *e)
{
  const uchar *next= s + 2;

  if (next > e) /* Need 2 characters */
    return MY_CS_TOOSMALL2;

  *pwc= ((my_wc_t) s[0] << 8) | s[1];
  return 2;
}
2954 
/*
  Encode 'wc' as one UCS-2 character at [r, e), high byte first.

  Returns 2 (the number of bytes written), MY_CS_TOOSMALL2 when fewer
  than 2 bytes are available, or MY_CS_ILUNI when 'wc' is above 0xFFFF.
  The space check is done first.
*/
static int my_uni_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
                       my_wc_t wc, uchar *r, uchar *e)
{
  uchar *next= r + 2;

  if (next > e)
    return MY_CS_TOOSMALL2;

  /* UCS2 does not support characters outside BMP */
  if (wc > 0xFFFF)
    return MY_CS_ILUNI;

  r[0]= (uchar) (wc >> 8);
  r[1]= (uchar) wc;
  return 2;
}
2968 
2969 
2970 static inline void
my_tolower_ucs2(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2971 my_tolower_ucs2(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2972 {
2973   const MY_UNICASE_CHARACTER *page;
2974   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2975     *wc= page[*wc & 0xFF].tolower;
2976 }
2977 
2978 
2979 static inline void
my_toupper_ucs2(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2980 my_toupper_ucs2(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2981 {
2982   const MY_UNICASE_CHARACTER *page;
2983   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2984     *wc= page[*wc & 0xFF].toupper;
2985 }
2986 
2987 
2988 static inline void
my_tosort_ucs2(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2989 my_tosort_ucs2(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2990 {
2991   const MY_UNICASE_CHARACTER *page;
2992   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2993     *wc= page[*wc & 0xFF].sort;
2994 }
2995 
2996 
/*
  Convert a UCS-2 string to upper case in place.

  'dst' and 'dstlen' are asserted to equal 'src' and 'srclen'.  Conversion
  stops at the first character that cannot be decoded or re-encoded in
  the same number of bytes.  Always returns 'srclen'.
*/
static size_t my_caseup_ucs2(const CHARSET_INFO *cs, char *src, size_t srclen,
                           char *dst MY_ATTRIBUTE((unused)),
                           size_t dstlen MY_ATTRIBUTE((unused)))
{
  const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  uchar *pos= (uchar*) src;
  uchar *end= pos + srclen;
  my_wc_t wc;
  int len;

  assert(src == dst && srclen == dstlen);

  while (pos < end)
  {
    len= my_ucs2_uni(cs, &wc, pos, end);
    if (len <= 0)
      break;

    my_toupper_ucs2(uni_plane, &wc);

    if (my_uni_ucs2(cs, wc, pos, end) != len)
      break;
    pos+= len;
  }
  return srclen;
}
3017 
3018 
/*
  Mix a UCS-2 string into the running hash state (*n1, *n2).

  Trailing spaces (byte pairs 00 20) are dropped first.  Every remaining
  character is passed through my_tosort_ucs2() with cs->caseinfo and then
  mixed in as two values: the low 8 bits first, then the value shifted
  right by 8.
*/
static void my_hash_sort_ucs2(const CHARSET_INFO *cs, const uchar *s,
                              size_t slen, ulong *n1, ulong *n2)
{
  const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const uchar *end= s + slen;
  ulong h1;
  ulong h2;
  my_wc_t wc;
  int len;

  /* Ignore trailing spaces */
  for ( ; end > s + 1 && end[-1] == ' ' && end[-2] == '\0'; end-= 2)
    ;

  h1= *n1;
  h2= *n2;

  while (s < end)
  {
    len= my_ucs2_uni(cs, &wc, s, end);
    if (len <= 0)
      break;

    my_tosort_ucs2(uni_plane, &wc);

    h1^= (((h1 & 63) + h2) * (wc & 0xFF)) + (h1 << 8);
    h2+= 3;
    h1^= (((h1 & 63) + h2) * (wc >> 8)) + (h1 << 8);
    h2+= 3;

    s+= len;
  }

  *n1= h1;
  *n2= h2;
}
3048 
3049 
/*
  Convert a UCS-2 string to lower case in place.

  'dst' and 'dstlen' are asserted to equal 'src' and 'srclen'.  Conversion
  stops at the first character that cannot be decoded or re-encoded in
  the same number of bytes.  Always returns 'srclen'.
*/
static size_t my_casedn_ucs2(const CHARSET_INFO *cs, char *src, size_t srclen,
                           char *dst MY_ATTRIBUTE((unused)),
                           size_t dstlen MY_ATTRIBUTE((unused)))
{
  const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  uchar *pos= (uchar*) src;
  uchar *end= pos + srclen;
  my_wc_t wc;
  int len;

  assert(src == dst && srclen == dstlen);

  while (pos < end)
  {
    len= my_ucs2_uni(cs, &wc, pos, end);
    if (len <= 0)
      break;

    my_tolower_ucs2(uni_plane, &wc);

    if (my_uni_ucs2(cs, wc, pos, end) != len)
      break;
    pos+= len;
  }
  return srclen;
}
3070 
3071 
3072 static void
my_fill_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s,size_t l,int fill)3073 my_fill_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3074              char *s, size_t l, int fill)
3075 {
3076   assert(fill <= 0xFFFF);
3077   for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
3078 }
3079 
3080 
/*
  Compare two UCS-2 strings using the collation's sort weights.

  Characters are decoded pairwise, mapped with my_tosort_ucs2() and
  compared.  If either side cannot be decoded, the result is the difference
  of the current leading bytes.

  t_is_prefix  when set, only the part of 's' covered by 't' is compared:
               the result is 0 if 't' was fully consumed

  RETURN
    < 0  s <  t
    = 0  s == t
    > 0  s >  t
*/
static int my_strnncoll_ucs2(const CHARSET_INFO *cs,
                             const uchar *s, size_t slen,
                             const uchar *t, size_t tlen,
                             my_bool t_is_prefix)
{
  const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  my_wc_t s_wc= 0;
  my_wc_t t_wc= 0;

  while (s < s_end && t < t_end)
  {
    int s_len= my_ucs2_uni(cs, &s_wc, s, s_end);
    int t_len= my_ucs2_uni(cs, &t_wc, t, t_end);

    /* Incorrect string, compare by char value */
    if (s_len <= 0 || t_len <= 0)
      return (int) s[0] - (int) t[0];

    my_tosort_ucs2(uni_plane, &s_wc);
    my_tosort_ucs2(uni_plane, &t_wc);

    if (s_wc > t_wc)
      return 1;
    if (s_wc < t_wc)
      return -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
3116 
3117 /*
3118   Compare strings, discarding end space
3119 
3120   SYNOPSIS
3121     my_strnncollsp_ucs2()
3122     cs                  character set handler
3123     a                   First string to compare
3124     a_length            Length of 'a'
3125     b                   Second string to compare
3126     b_length            Length of 'b'
3127 
3128   IMPLEMENTATION
    If one string is shorter than the other, the shorter one is treated as
    if it were extended with spaces so that both strings have equal length.
3131 
3132     This will ensure that the following things hold:
3133 
3134     "a"  == "a "
3135     "a\0" < "a"
3136     "a\0" < "a "
3137 
3138   RETURN
3139     < 0  a <  b
3140     = 0  a == b
3141     > 0  a > b
3142 */
3143 
static int my_strnncollsp_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
                               const uchar *s, size_t slen,
                               const uchar *t, size_t tlen,
                               my_bool diff_if_only_endspace_difference
                               MY_ATTRIBUTE((unused)))
{
  const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const uchar *se, *te, *common_end;
  const uchar *rest, *rest_end;
  int sign;

  /* Drop a stray odd byte so that only whole 2-byte characters are seen */
  slen-= slen & 1;
  tlen-= tlen & 1;

  se= s + slen;
  te= t + tlen;

  /* Compare the part both strings have in common, by sort weight */
  common_end= s + MY_MIN(slen, tlen);
  while (s < common_end)
  {
    int s_wc, t_wc;

    if (uni_plane->page[s[0]])
      s_wc= (int) uni_plane->page[s[0]][s[1]].sort;
    else
      s_wc= (((int) s[0]) << 8) + (int) s[1];

    if (uni_plane->page[t[0]])
      t_wc= (int) uni_plane->page[t[0]][t[1]].sort;
    else
      t_wc= (((int) t[0]) << 8) + (int) t[1];

    if (s_wc != t_wc)
      return s_wc > t_wc ? 1 : -1;

    s+= 2;
    t+= 2;
  }

  if (slen == tlen)
    return 0;

  /* Pick the tail of the longer string; 'sign' is the result if it wins */
  if (slen < tlen)
  {
    rest= t;
    rest_end= te;
    sign= -1;
  }
  else
  {
    rest= s;
    rest_end= se;
    sign= 1;
  }

  /* Compare the tail against an implicit run of spaces */
  while (rest < rest_end)
  {
    if (rest[0] != 0)
      return sign;                      /* high byte set: above U+0020 */
    if (rest[1] != ' ')
      return rest[1] < ' ' ? -sign : sign;
    rest+= 2;
  }
  return 0;
}
3193 
3194 
my_ismbchar_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)))3195 static uint my_ismbchar_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3196                              const char *b MY_ATTRIBUTE((unused)),
3197                              const char *e MY_ATTRIBUTE((unused)))
3198 {
3199   return 2;
3200 }
3201 
3202 
my_mbcharlen_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))3203 static uint my_mbcharlen_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
3204                               uint c MY_ATTRIBUTE((unused)))
3205 {
3206   return 2;
3207 }
3208 
3209 
3210 static
my_numchars_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e)3211 size_t my_numchars_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3212                         const char *b, const char *e)
3213 {
3214   return (size_t) (e-b)/2;
3215 }
3216 
3217 
3218 static
my_charpos_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)),size_t pos)3219 size_t my_charpos_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3220                        const char *b  MY_ATTRIBUTE((unused)),
3221                        const char *e  MY_ATTRIBUTE((unused)),
3222                        size_t pos)
3223 {
3224   size_t string_length= (size_t) (e - b);
3225   return pos > string_length ? string_length + 2 : pos * 2;
3226 }
3227 
3228 
3229 static
my_well_formed_len_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t nchars,int * error)3230 size_t my_well_formed_len_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3231                                const char *b, const char *e,
3232                                size_t nchars, int *error)
3233 {
3234   /* Ensure string length is dividable with 2 */
3235   size_t nbytes= ((size_t) (e-b)) & ~(size_t) 1;
3236   *error= 0;
3237   nchars*= 2;
3238   return MY_MIN(nbytes, nchars);
3239 }
3240 
3241 
3242 static
my_wildcmp_ucs2_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3243 int my_wildcmp_ucs2_ci(const CHARSET_INFO *cs,
3244 		    const char *str,const char *str_end,
3245 		    const char *wildstr,const char *wildend,
3246 		    int escape, int w_one, int w_many)
3247 {
3248   const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3249   return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3250                             escape,w_one,w_many,uni_plane);
3251 }
3252 
3253 
3254 static
my_wildcmp_ucs2_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3255 int my_wildcmp_ucs2_bin(const CHARSET_INFO *cs,
3256 		    const char *str,const char *str_end,
3257 		    const char *wildstr,const char *wildend,
3258 		    int escape, int w_one, int w_many)
3259 {
3260   return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3261                             escape,w_one,w_many,NULL);
3262 }
3263 
3264 
3265 static
my_strnncoll_ucs2_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)3266 int my_strnncoll_ucs2_bin(const CHARSET_INFO *cs,
3267                           const uchar *s, size_t slen,
3268                           const uchar *t, size_t tlen,
3269                           my_bool t_is_prefix)
3270 {
3271   int s_res,t_res;
3272   my_wc_t s_wc= 0, t_wc= 0;
3273   const uchar *se=s+slen;
3274   const uchar *te=t+tlen;
3275 
3276   while ( s < se && t < te )
3277   {
3278     s_res=my_ucs2_uni(cs,&s_wc, s, se);
3279     t_res=my_ucs2_uni(cs,&t_wc, t, te);
3280 
3281     if ( s_res <= 0 || t_res <= 0 )
3282     {
3283       /* Incorrect string, compare by char value */
3284       return ((int)s[0]-(int)t[0]);
3285     }
3286     if ( s_wc != t_wc )
3287     {
3288       return  s_wc > t_wc ? 1 : -1;
3289     }
3290 
3291     s+=s_res;
3292     t+=t_res;
3293   }
3294   return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
3295 }
3296 
/*
  Compare two UCS-2 strings by code value, ignoring trailing spaces.

  The common part is compared as big-endian 16-bit units.  If the lengths
  differ, the tail of the longer string is compared against spaces: a tail
  unit below U+0020 makes the longer string the smaller one, a unit above
  it makes the longer string the greater one.

  RETURN
    < 0  s <  t
    = 0  s == t
    > 0  s >  t
*/
static int my_strnncollsp_ucs2_bin(const CHARSET_INFO *cs
                                   MY_ATTRIBUTE((unused)),
                                   const uchar *s, size_t slen,
                                   const uchar *t, size_t tlen,
                                   my_bool diff_if_only_endspace_difference
                                   MY_ATTRIBUTE((unused)))
{
  const uchar *se, *te, *common_end;
  const uchar *rest, *rest_end;
  int sign;

  /* Drop a stray odd byte so that only whole 2-byte characters are seen */
  slen-= slen & 1;
  tlen-= tlen & 1;

  se= s + slen;
  te= t + tlen;

  common_end= s + MY_MIN(slen, tlen);
  while (s < common_end)
  {
    int s_wc= (s[0] << 8) + s[1];
    int t_wc= (t[0] << 8) + t[1];

    if (s_wc != t_wc)
      return s_wc > t_wc ? 1 : -1;

    s+= 2;
    t+= 2;
  }

  if (slen == tlen)
    return 0;

  /* Pick the tail of the longer string; 'sign' is the result if it wins */
  if (slen < tlen)
  {
    rest= t;
    rest_end= te;
    sign= -1;
  }
  else
  {
    rest= s;
    rest_end= se;
    sign= 1;
  }

  /* Compare the tail against an implicit run of spaces */
  while (rest < rest_end)
  {
    if (rest[0] != 0)
      return sign;                      /* high byte set: above U+0020 */
    if (rest[1] != ' ')
      return rest[1] < ' ' ? -sign : sign;
    rest+= 2;
  }
  return 0;
}
3343 
3344 
3345 static
my_hash_sort_ucs2_bin(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * key,size_t len,ulong * nr1,ulong * nr2)3346 void my_hash_sort_ucs2_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3347 			   const uchar *key, size_t len,ulong *nr1, ulong *nr2)
3348 {
3349   const uchar *pos = key;
3350   ulong tmp1;
3351   ulong tmp2;
3352 
3353   key+= len;
3354 
3355   while (key > pos+1 && key[-1] == ' ' && key[-2] == '\0')
3356     key-= 2;
3357 
3358   tmp1= *nr1;
3359   tmp2= *nr2;
3360 
3361   for (; pos < (uchar*) key ; pos++)
3362   {
3363     tmp1^=(ulong) ((((uint) tmp1 & 63) + tmp2) *
3364      ((uint)*pos)) + (tmp1 << 8);
3365     tmp2+=3;
3366   }
3367 
3368   *nr1= tmp1;
3369   *nr2= tmp2;
3370 }
3371 
3372 
/*
  Collation handler used by my_charset_ucs2_general_ci and
  my_charset_ucs2_general_mysql500_ci.

  The comparison, wildcard and hash entries are the ucs2 routines defined
  above that consult cs->caseinfo; the remaining entries are helpers
  defined elsewhere.  The initializer is positional, so the order of
  entries must follow the declaration of MY_COLLATION_HANDLER.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
{
    NULL,		/* init */
    my_strnncoll_ucs2,
    my_strnncollsp_ucs2,
    my_strnxfrm_unicode,
    my_strnxfrmlen_simple,
    my_like_range_generic,
    my_wildcmp_ucs2_ci,
    my_strcasecmp_mb2_or_mb4,
    my_instr_mb,
    my_hash_sort_ucs2,
    my_propagate_simple
};
3387 
3388 
/*
  Collation handler used by my_charset_ucs2_bin.

  The comparison, wildcard and hash entries are the "_bin" ucs2 routines
  defined above, which work on code values without a case table; the
  remaining entries are helpers defined elsewhere.  The initializer is
  positional, so the order of entries must follow the declaration of
  MY_COLLATION_HANDLER.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
{
    NULL,		/* init */
    my_strnncoll_ucs2_bin,
    my_strnncollsp_ucs2_bin,
    my_strnxfrm_unicode,
    my_strnxfrmlen_simple,
    my_like_range_generic,
    my_wildcmp_ucs2_bin,
    my_strcasecmp_mb2_or_mb4,
    my_instr_mb,
    my_hash_sort_ucs2_bin,
    my_propagate_simple
};
3403 
3404 
/*
  Character set handler shared by all ucs2 CHARSET_INFO definitions below
  (ucs2_general_ci, ucs2_general_mysql500_ci and ucs2_bin).

  Entries with a "_ucs2" suffix are specific to this character set; the
  "_mb2" and "_mb2_or_mb4" entries are helpers defined elsewhere in this
  file.  The initializer is positional, so the order of entries must follow
  the declaration of MY_CHARSET_HANDLER.
*/
MY_CHARSET_HANDLER my_charset_ucs2_handler=
{
    NULL,		/* init */
    my_ismbchar_ucs2,	/* ismbchar     */
    my_mbcharlen_ucs2,	/* mbcharlen    */
    my_numchars_ucs2,
    my_charpos_ucs2,
    my_well_formed_len_ucs2,
    my_lengthsp_mb2,
    my_numcells_mb,
    my_ucs2_uni,	/* mb_wc        */
    my_uni_ucs2,	/* wc_mb        */
    my_mb_ctype_mb,
    my_caseup_str_mb2_or_mb4,
    my_casedn_str_mb2_or_mb4,
    my_caseup_ucs2,
    my_casedn_ucs2,
    my_snprintf_mb2,
    my_l10tostr_mb2_or_mb4,
    my_ll10tostr_mb2_or_mb4,
    my_fill_ucs2,
    my_strntol_mb2_or_mb4,
    my_strntoul_mb2_or_mb4,
    my_strntoll_mb2_or_mb4,
    my_strntoull_mb2_or_mb4,
    my_strntod_mb2_or_mb4,
    my_strtoll10_mb2,
    my_strntoull10rnd_mb2_or_mb4,
    my_scan_mb2
};
3435 
3436 
/*
  Collation "ucs2_general_ci" of character set "ucs2" (number 35).

  Carries the MY_CS_PRIMARY flag, uses my_unicase_default as its case
  table and is served by my_charset_ucs2_handler together with
  my_collation_ucs2_general_ci_handler.  Every character is exactly two
  bytes (mbminlen == mbmaxlen == 2) and the pad character is a space.
*/
CHARSET_INFO my_charset_ucs2_general_ci=
{
    35,0,0,		/* number       */
    MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
    "ucs2",		/* cs name    */
    "ucs2_general_ci",	/* name         */
    "",			/* comment      */
    NULL,		/* tailoring    */
    ctype_ucs2,		/* ctype        */
    to_lower_ucs2,	/* to_lower     */
    to_upper_ucs2,	/* to_upper     */
    to_upper_ucs2,	/* sort_order   */
    NULL,		/* uca          */
    NULL,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
    1,			/* strxfrm_multiply */
    1,                  /* caseup_multiply  */
    1,                  /* casedn_multiply  */
    2,			/* mbminlen     */
    2,			/* mbmaxlen     */
    1,			/* mbmaxlenlen  */
    0,			/* min_sort_char */
    0xFFFF,		/* max_sort_char */
    ' ',                /* pad char      */
    0,                  /* escape_with_backslash_is_dangerous */
    1,                  /* levels_for_compare */
    1,                  /* levels_for_order   */
    &my_charset_ucs2_handler,
    &my_collation_ucs2_general_ci_handler
};
3470 
3471 
/*
  Collation "ucs2_general_mysql500_ci" of character set "ucs2"
  (number 159).

  Compared with my_charset_ucs2_general_ci, this definition has a different
  number and name, does not carry MY_CS_PRIMARY, and uses
  my_unicase_mysql500 as its case table.  It shares the same charset and
  collation handlers.
*/
CHARSET_INFO my_charset_ucs2_general_mysql500_ci=
{
  159, 0, 0,                                       /* number           */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
  "ucs2",                                          /* cs name          */
  "ucs2_general_mysql500_ci",                      /* name             */
  "",                                              /* comment          */
  NULL,                                            /* tailoring        */
  ctype_ucs2,                                      /* ctype            */
  to_lower_ucs2,                                   /* to_lower         */
  to_upper_ucs2,                                   /* to_upper         */
  to_upper_ucs2,                                   /* sort_order       */
  NULL,                                            /* uca              */
  NULL,                                            /* tab_to_uni       */
  NULL,                                            /* tab_from_uni     */
  &my_unicase_mysql500,                            /* caseinfo         */
  NULL,                                            /* state_map        */
  NULL,                                            /* ident_map        */
  1,                                               /* strxfrm_multiply */
  1,                                               /* caseup_multiply  */
  1,                                               /* casedn_multiply  */
  2,                                               /* mbminlen         */
  2,                                               /* mbmaxlen         */
  1,                                               /* mbmaxlenlen      */
  0,                                               /* min_sort_char    */
  0xFFFF,                                          /* max_sort_char    */
  ' ',                                             /* pad char         */
  0,                          /* escape_with_backslash_is_dangerous    */
  1,                                               /* levels_for_compare */
  1,                                               /* levels_for_order   */
  &my_charset_ucs2_handler,
  &my_collation_ucs2_general_ci_handler
};
3505 
3506 
/*
  Collation "ucs2_bin" of character set "ucs2" (number 90).

  Carries the MY_CS_BINSORT flag, has no sort_order table (NULL) and is
  served by my_charset_ucs2_handler together with
  my_collation_ucs2_bin_handler.  Every character is exactly two bytes
  (mbminlen == mbmaxlen == 2) and the pad character is a space.
*/
CHARSET_INFO my_charset_ucs2_bin=
{
    90,0,0,		/* number       */
    MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
    "ucs2",		/* cs name    */
    "ucs2_bin",		/* name         */
    "",			/* comment      */
    NULL,		/* tailoring    */
    ctype_ucs2,		/* ctype        */
    to_lower_ucs2,	/* to_lower     */
    to_upper_ucs2,	/* to_upper     */
    NULL,		/* sort_order   */
    NULL,		/* uca          */
    NULL,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
    1,			/* strxfrm_multiply */
    1,                  /* caseup_multiply  */
    1,                  /* casedn_multiply  */
    2,			/* mbminlen     */
    2,			/* mbmaxlen     */
    1,			/* mbmaxlenlen  */
    0,			/* min_sort_char */
    0xFFFF,		/* max_sort_char */
    ' ',                /* pad char      */
    0,                  /* escape_with_backslash_is_dangerous */
    1,                  /* levels_for_compare */
    1,                  /* levels_for_order   */
    &my_charset_ucs2_handler,
    &my_collation_ucs2_bin_handler
};
3540 
3541 
3542 #endif /* HAVE_CHARSET_ucs2 */
3543