1 /* Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
2 
3    This library is free software; you can redistribute it and/or
4    modify it under the terms of the GNU Library General Public
5    License as published by the Free Software Foundation; version 2
6    of the License.
7 
8    This library is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11    Library General Public License for more details.
12 
13    You should have received a copy of the GNU Library General Public
14    License along with this library; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
16 
17 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
18 
19 #include <my_global.h>
20 #include <my_sys.h>
21 #include "m_string.h"
22 #include "m_ctype.h"
23 #include <errno.h>
24 #include <stdarg.h>
25 
26 
27 #if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2)
28 #define HAVE_CHARSET_mb2
29 #endif
30 
31 
32 #if defined(HAVE_CHARSET_mb2) || defined(HAVE_CHARSET_utf32)
33 #define HAVE_CHARSET_mb2_or_mb4
34 #endif
35 
36 
37 #ifndef EILSEQ
38 #define EILSEQ ENOENT
39 #endif
40 
/* All bits set: the largest value an ulonglong can hold. */
#undef  ULONGLONG_MAX
#define ULONGLONG_MAX                (~(ulonglong) 0)

/* 2^63, i.e. the magnitude of the most negative 64-bit signed value. */
#define MAX_NEGATIVE_NUMBER        ((ulonglong) LL(0x8000000000000000))

/* Number of decimal digits my_strtoll10_mb2() collects per chunk. */
#define INIT_CNT  9

/* Decimal scale factors 10^9, 10^10 and 10^11. */
#define LFACTOR   ULL(1000000000)
#define LFACTOR1  ULL(10000000000)
#define LFACTOR2  ULL(100000000000)

/* Powers of ten: lfactor[n] == 10^n for n in 0..8. */
static unsigned long lfactor[9]=
{
  1L,
  10L,
  100L,
  1000L,
  10000L,
  100000L,
  1000000L,
  10000000L,
  100000000L
};
51 
52 
53 
54 #ifdef HAVE_CHARSET_mb2_or_mb4
55 static inline int
my_bincmp(const uchar * s,const uchar * se,const uchar * t,const uchar * te)56 my_bincmp(const uchar *s, const uchar *se,
57           const uchar *t, const uchar *te)
58 {
59   int slen= (int) (se - s), tlen= (int) (te - t);
60   int len= MY_MIN(slen, tlen);
61   int cmp= memcmp(s, t, len);
62   return cmp ? cmp : slen - tlen;
63 }
64 
65 
66 static size_t
my_caseup_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s MY_ATTRIBUTE ((unused)))67 my_caseup_str_mb2_or_mb4(const CHARSET_INFO * cs  MY_ATTRIBUTE((unused)),
68                          char * s MY_ATTRIBUTE((unused)))
69 {
70   DBUG_ASSERT(0);
71   return 0;
72 }
73 
74 
75 static size_t
my_casedn_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s MY_ATTRIBUTE ((unused)))76 my_casedn_str_mb2_or_mb4(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
77                          char * s MY_ATTRIBUTE((unused)))
78 {
79   DBUG_ASSERT(0);
80   return 0;
81 }
82 
83 
84 static int
my_strcasecmp_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * s MY_ATTRIBUTE ((unused)),const char * t MY_ATTRIBUTE ((unused)))85 my_strcasecmp_mb2_or_mb4(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
86                          const char *s MY_ATTRIBUTE((unused)),
87                          const char *t MY_ATTRIBUTE((unused)))
88 {
89   DBUG_ASSERT(0);
90   return 0;
91 }
92 
93 
94 static long
my_strntol_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)95 my_strntol_mb2_or_mb4(const CHARSET_INFO *cs,
96                       const char *nptr, size_t l, int base,
97                       char **endptr, int *err)
98 {
99   int      negative= 0;
100   int      overflow;
101   int      cnv;
102   my_wc_t  wc;
103   unsigned int cutlim;
104   uint32 cutoff;
105   uint32 res;
106   const uchar *s= (const uchar*) nptr;
107   const uchar *e= (const uchar*) nptr+l;
108   const uchar *save;
109 
110   *err= 0;
111   do
112   {
113     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e))>0)
114     {
115       switch (wc)
116       {
117         case ' ' : break;
118         case '\t': break;
119         case '-' : negative= !negative; break;
120         case '+' : break;
121         default  : goto bs;
122       }
123     }
124     else /* No more characters or bad multibyte sequence */
125     {
126       if (endptr != NULL )
127         *endptr= (char*) s;
128       err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
129       return 0;
130     }
131     s+= cnv;
132   } while (1);
133 
134 bs:
135 
136   overflow= 0;
137   res= 0;
138   save= s;
139   cutoff= ((uint32)~0L) / (uint32) base;
140   cutlim= (uint) (((uint32)~0L) % (uint32) base);
141 
142   do {
143     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
144     {
145       s+= cnv;
146       if (wc >= '0' && wc <= '9')
147         wc-= '0';
148       else if (wc >= 'A' && wc <= 'Z')
149         wc= wc - 'A' + 10;
150       else if (wc >= 'a' && wc <= 'z')
151         wc= wc - 'a' + 10;
152       else
153         break;
154       if ((int)wc >= base)
155         break;
156       if (res > cutoff || (res == cutoff && wc > cutlim))
157         overflow= 1;
158       else
159       {
160         res*= (uint32) base;
161         res+= wc;
162       }
163     }
164     else if (cnv == MY_CS_ILSEQ)
165     {
166       if (endptr !=NULL )
167         *endptr = (char*) s;
168       err[0]= EILSEQ;
169       return 0;
170     }
171     else
172     {
173       /* No more characters */
174       break;
175     }
176   } while(1);
177 
178   if (endptr != NULL)
179     *endptr = (char *) s;
180 
181   if (s == save)
182   {
183     err[0]= EDOM;
184     return 0L;
185   }
186 
187   if (negative)
188   {
189     if (res > (uint32) INT_MIN32)
190       overflow= 1;
191   }
192   else if (res > INT_MAX32)
193     overflow= 1;
194 
195   if (overflow)
196   {
197     err[0]= ERANGE;
198     return negative ? INT_MIN32 : INT_MAX32;
199   }
200 
201   return (negative ? -((long) res) : (long) res);
202 }
203 
204 
205 static ulong
my_strntoul_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)206 my_strntoul_mb2_or_mb4(const CHARSET_INFO *cs,
207                        const char *nptr, size_t l, int base,
208                        char **endptr, int *err)
209 {
210   int      negative= 0;
211   int      overflow;
212   int      cnv;
213   my_wc_t  wc;
214   unsigned int cutlim;
215   uint32 cutoff;
216   uint32 res;
217   const uchar *s= (const uchar*) nptr;
218   const uchar *e= (const uchar*) nptr + l;
219   const uchar *save;
220 
221   *err= 0;
222   do
223   {
224     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
225     {
226       switch (wc)
227       {
228         case ' ' : break;
229         case '\t': break;
230         case '-' : negative= !negative; break;
231         case '+' : break;
232         default  : goto bs;
233       }
234     }
235     else /* No more characters or bad multibyte sequence */
236     {
237       if (endptr !=NULL )
238         *endptr= (char*)s;
239       err[0]= (cnv == MY_CS_ILSEQ) ? EILSEQ : EDOM;
240       return 0;
241     }
242     s+= cnv;
243   } while (1);
244 
245 bs:
246 
247   overflow= 0;
248   res= 0;
249   save= s;
250   cutoff= ((uint32)~0L) / (uint32) base;
251   cutlim= (uint) (((uint32)~0L) % (uint32) base);
252 
253   do
254   {
255     if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
256     {
257       s+= cnv;
258       if (wc >= '0' && wc <= '9')
259         wc-= '0';
260       else if (wc >= 'A' && wc <= 'Z')
261         wc= wc - 'A' + 10;
262       else if (wc >= 'a' && wc <= 'z')
263         wc= wc - 'a' + 10;
264       else
265         break;
266       if ((int) wc >= base)
267         break;
268       if (res > cutoff || (res == cutoff && wc > cutlim))
269         overflow = 1;
270       else
271       {
272         res*= (uint32) base;
273         res+= wc;
274       }
275     }
276     else if (cnv == MY_CS_ILSEQ)
277     {
278       if (endptr != NULL )
279         *endptr= (char*)s;
280       err[0]= EILSEQ;
281       return 0;
282     }
283     else
284     {
285       /* No more characters */
286       break;
287     }
288   } while(1);
289 
290   if (endptr != NULL)
291     *endptr= (char *) s;
292 
293   if (s == save)
294   {
295     err[0]= EDOM;
296     return 0L;
297   }
298 
299   if (overflow)
300   {
301     err[0]= (ERANGE);
302     return (~(uint32) 0);
303   }
304 
305   return (negative ? -((long) res) : (long) res);
306 }
307 
308 
309 static longlong
my_strntoll_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)310 my_strntoll_mb2_or_mb4(const CHARSET_INFO *cs,
311                        const char *nptr, size_t l, int base,
312                        char **endptr, int *err)
313 {
314   int      negative=0;
315   int      overflow;
316   int      cnv;
317   my_wc_t  wc;
318   ulonglong    cutoff;
319   unsigned int cutlim;
320   ulonglong    res;
321   const uchar *s= (const uchar*) nptr;
322   const uchar *e= (const uchar*) nptr+l;
323   const uchar *save;
324 
325   *err= 0;
326   do
327   {
328     if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
329     {
330       switch (wc)
331       {
332         case ' ' : break;
333         case '\t': break;
334         case '-' : negative= !negative; break;
335         case '+' : break;
336         default  : goto bs;
337       }
338     }
339     else /* No more characters or bad multibyte sequence */
340     {
341       if (endptr !=NULL )
342         *endptr = (char*)s;
343       err[0] = (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
344       return 0;
345     }
346     s+=cnv;
347   } while (1);
348 
349 bs:
350 
351   overflow = 0;
352   res = 0;
353   save = s;
354   cutoff = (~(ulonglong) 0) / (unsigned long int) base;
355   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
356 
357   do {
358     if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
359     {
360       s+=cnv;
361       if ( wc>='0' && wc<='9')
362         wc -= '0';
363       else if ( wc>='A' && wc<='Z')
364         wc = wc - 'A' + 10;
365       else if ( wc>='a' && wc<='z')
366         wc = wc - 'a' + 10;
367       else
368         break;
369       if ((int)wc >= base)
370         break;
371       if (res > cutoff || (res == cutoff && wc > cutlim))
372         overflow = 1;
373       else
374       {
375         res *= (ulonglong) base;
376         res += wc;
377       }
378     }
379     else if (cnv==MY_CS_ILSEQ)
380     {
381       if (endptr !=NULL )
382         *endptr = (char*)s;
383       err[0]=EILSEQ;
384       return 0;
385     }
386     else
387     {
388       /* No more characters */
389       break;
390     }
391   } while(1);
392 
393   if (endptr != NULL)
394     *endptr = (char *) s;
395 
396   if (s == save)
397   {
398     err[0]=EDOM;
399     return 0L;
400   }
401 
402   if (negative)
403   {
404     if (res  > (ulonglong) LONGLONG_MIN)
405       overflow = 1;
406   }
407   else if (res > (ulonglong) LONGLONG_MAX)
408     overflow = 1;
409 
410   if (overflow)
411   {
412     err[0]=ERANGE;
413     return negative ? LONGLONG_MIN : LONGLONG_MAX;
414   }
415 
416   return (negative ? -((longlong)res) : (longlong)res);
417 }
418 
419 
420 static ulonglong
my_strntoull_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)421 my_strntoull_mb2_or_mb4(const CHARSET_INFO *cs,
422                         const char *nptr, size_t l, int base,
423                         char **endptr, int *err)
424 {
425   int      negative= 0;
426   int      overflow;
427   int      cnv;
428   my_wc_t  wc;
429   ulonglong    cutoff;
430   unsigned int cutlim;
431   ulonglong    res;
432   const uchar *s= (const uchar*) nptr;
433   const uchar *e= (const uchar*) nptr + l;
434   const uchar *save;
435 
436   *err= 0;
437   do
438   {
439     if ((cnv= cs->cset->mb_wc(cs,&wc,s,e)) > 0)
440     {
441       switch (wc)
442       {
443         case ' ' : break;
444         case '\t': break;
445         case '-' : negative= !negative; break;
446         case '+' : break;
447         default  : goto bs;
448       }
449     }
450     else /* No more characters or bad multibyte sequence */
451     {
452       if (endptr !=NULL )
453         *endptr = (char*)s;
454       err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
455       return 0;
456     }
457     s+=cnv;
458   } while (1);
459 
460 bs:
461 
462   overflow = 0;
463   res = 0;
464   save = s;
465   cutoff = (~(ulonglong) 0) / (unsigned long int) base;
466   cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
467 
468   do
469   {
470     if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
471     {
472       s+=cnv;
473       if ( wc>='0' && wc<='9')
474         wc -= '0';
475       else if ( wc>='A' && wc<='Z')
476         wc = wc - 'A' + 10;
477       else if ( wc>='a' && wc<='z')
478         wc = wc - 'a' + 10;
479       else
480         break;
481       if ((int)wc >= base)
482         break;
483       if (res > cutoff || (res == cutoff && wc > cutlim))
484         overflow = 1;
485       else
486       {
487         res *= (ulonglong) base;
488         res += wc;
489       }
490     }
491     else if (cnv==MY_CS_ILSEQ)
492     {
493       if (endptr !=NULL )
494         *endptr = (char*)s;
495       err[0]= EILSEQ;
496       return 0;
497     }
498     else
499     {
500       /* No more characters */
501       break;
502     }
503   } while(1);
504 
505   if (endptr != NULL)
506     *endptr = (char *) s;
507 
508   if (s == save)
509   {
510     err[0]= EDOM;
511     return 0L;
512   }
513 
514   if (overflow)
515   {
516     err[0]= ERANGE;
517     return (~(ulonglong) 0);
518   }
519 
520   return (negative ? -((longlong) res) : (longlong) res);
521 }
522 
523 
524 static double
my_strntod_mb2_or_mb4(const CHARSET_INFO * cs,char * nptr,size_t length,char ** endptr,int * err)525 my_strntod_mb2_or_mb4(const CHARSET_INFO *cs,
526                       char *nptr, size_t length,
527                       char **endptr, int *err)
528 {
529   char     buf[256];
530   double   res;
531   char *b= buf;
532   const uchar *s= (const uchar*) nptr;
533   const uchar *end;
534   my_wc_t  wc;
535   int     cnv;
536 
537   *err= 0;
538   /* Cut too long strings */
539   if (length >= sizeof(buf))
540     length= sizeof(buf) - 1;
541   end= s + length;
542 
543   while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
544   {
545     s+= cnv;
546     if (wc > (int) (uchar) 'e' || !wc)
547       break;                                        /* Can't be part of double */
548     *b++= (char) wc;
549   }
550 
551   *endptr= b;
552   res= my_strtod(buf, endptr, err);
553   *endptr= nptr + cs->mbminlen * (size_t) (*endptr - buf);
554   return res;
555 }
556 
557 
558 static ulonglong
my_strntoull10rnd_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t length,int unsign_fl,char ** endptr,int * err)559 my_strntoull10rnd_mb2_or_mb4(const CHARSET_INFO *cs,
560                              const char *nptr, size_t length,
561                              int unsign_fl,
562                              char **endptr, int *err)
563 {
564   char  buf[256], *b= buf;
565   ulonglong res;
566   const uchar *end, *s= (const uchar*) nptr;
567   my_wc_t  wc;
568   int     cnv;
569 
570   /* Cut too long strings */
571   if (length >= sizeof(buf))
572     length= sizeof(buf)-1;
573   end= s + length;
574 
575   while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
576   {
577     s+= cnv;
578     if (wc > (int) (uchar) 'e' || !wc)
579       break;                            /* Can't be a number part */
580     *b++= (char) wc;
581   }
582 
583   res= my_strntoull10rnd_8bit(cs, buf, b - buf, unsign_fl, endptr, err);
584   *endptr= (char*) nptr + cs->mbminlen * (size_t) (*endptr - buf);
585   return res;
586 }
587 
588 
589 /*
590   This is a fast version optimized for the case of radix 10 / -10
591 */
592 
593 static size_t
my_l10tostr_mb2_or_mb4(const CHARSET_INFO * cs,char * dst,size_t len,int radix,long int val)594 my_l10tostr_mb2_or_mb4(const CHARSET_INFO *cs,
595                        char *dst, size_t len, int radix, long int val)
596 {
597   char buffer[66];
598   char *p, *db, *de;
599   long int new_val;
600   int  sl= 0;
601   unsigned long int uval = (unsigned long int) val;
602 
603   p= &buffer[sizeof(buffer) - 1];
604   *p= '\0';
605 
606   if (radix < 0)
607   {
608     if (val < 0)
609     {
610       sl= 1;
611       /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
612       uval  = (unsigned long int)0 - uval;
613     }
614   }
615 
616   new_val = (long) (uval / 10);
617   *--p    = '0'+ (char) (uval - (unsigned long) new_val * 10);
618   val= new_val;
619 
620   while (val != 0)
621   {
622     new_val= val / 10;
623     *--p= '0' + (char) (val - new_val * 10);
624     val= new_val;
625   }
626 
627   if (sl)
628   {
629     *--p= '-';
630   }
631 
632   for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
633   {
634     int cnvres= cs->cset->wc_mb(cs,(my_wc_t)p[0],(uchar*) dst, (uchar*) de);
635     if (cnvres > 0)
636       dst+= cnvres;
637     else
638       break;
639   }
640   return (int) (dst - db);
641 }
642 
643 
644 static size_t
my_ll10tostr_mb2_or_mb4(const CHARSET_INFO * cs,char * dst,size_t len,int radix,longlong val)645 my_ll10tostr_mb2_or_mb4(const CHARSET_INFO *cs,
646                         char *dst, size_t len, int radix, longlong val)
647 {
648   char buffer[65];
649   char *p, *db, *de;
650   long long_val;
651   int sl= 0;
652   ulonglong uval= (ulonglong) val;
653 
654   if (radix < 0)
655   {
656     if (val < 0)
657     {
658       sl= 1;
659       /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
660       uval = (ulonglong)0 - uval;
661     }
662   }
663 
664   p= &buffer[sizeof(buffer)-1];
665   *p='\0';
666 
667   if (uval == 0)
668   {
669     *--p= '0';
670     goto cnv;
671   }
672 
673   while (uval > (ulonglong) LONG_MAX)
674   {
675     ulonglong quo= uval/(uint) 10;
676     uint rem= (uint) (uval- quo* (uint) 10);
677     *--p= '0' + rem;
678     uval= quo;
679   }
680 
681   long_val= (long) uval;
682   while (long_val != 0)
683   {
684     long quo= long_val/10;
685     *--p= (char) ('0' + (long_val - quo*10));
686     long_val= quo;
687   }
688 
689 cnv:
690   if (sl)
691   {
692     *--p= '-';
693   }
694 
695   for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
696   {
697     int cnvres= cs->cset->wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
698     if (cnvres > 0)
699       dst+= cnvres;
700     else
701       break;
702   }
703   return (int) (dst -db);
704 }
705 
706 #endif /* HAVE_CHARSET_mb2_or_mb4 */
707 
708 
709 #ifdef HAVE_CHARSET_mb2
710 static longlong
my_strtoll10_mb2(const CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)711 my_strtoll10_mb2(const CHARSET_INFO *cs,
712                  const char *nptr, char **endptr, int *error)
713 {
714   const char *s, *end, *start, *n_end, *true_end;
715   uchar c;
716   unsigned long i, j, k;
717   ulonglong li;
718   int negative;
719   ulong cutoff, cutoff2, cutoff3;
720   my_wc_t wc;
721   int res;
722 
723   s= nptr;
724   /* If fixed length string */
725   if (endptr)
726   {
727     /*
728       Make sure string length is even.
729       Odd length indicates a bug in the caller.
730       Assert in debug, round in production.
731     */
732     DBUG_ASSERT((*endptr - s) % 2 == 0);
733     end= s + ((*endptr - s) / 2) * 2;
734 
735     for ( ; ; ) /* Skip leading spaces and tabs */
736     {
737       res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
738       if (res <= 0)
739         goto no_conv;
740       s+= res;
741       if (wc != ' ' && wc != '\t')
742         break;
743     }
744   }
745   else
746   {
747      /* We don't support null terminated strings in UCS2 */
748      goto no_conv;
749   }
750 
751   /* Check for a sign. */
752   negative= 0;
753   if (wc == '-')
754   {
755     *error= -1;                                        /* Mark as negative number */
756     negative= 1;
757     res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
758     if (res < 0)
759       goto no_conv;
760     s+= res;
761     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
762     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
763     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
764   }
765   else
766   {
767     *error= 0;
768     if (wc == '+')
769     {
770       res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
771       if (res < 0)
772         goto no_conv;
773       s+= res;
774     }
775     cutoff=  ULONGLONG_MAX / LFACTOR2;
776     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
777     cutoff3=  ULONGLONG_MAX % 100;
778   }
779 
780 
781   /* Handle case where we have a lot of pre-zero */
782   if (wc == '0')
783   {
784     i= 0;
785     for ( ; ; s+= res)
786     {
787       if (s == end)
788         goto end_i;                                /* Return 0 */
789       res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
790       if (res < 0)
791         goto no_conv;
792       if (wc != '0')
793         break;
794     }
795     while (wc == '0');
796     n_end= s + 2 * INIT_CNT;
797   }
798   else
799   {
800     /* Read first digit to check that it's a valid number */
801     if ((c= (wc - '0')) > 9)
802       goto no_conv;
803     i= c;
804     n_end= s + 2 * (INIT_CNT-1);
805   }
806 
807   /* Handle first 9 digits and store them in i */
808   if (n_end > end)
809     n_end= end;
810   for ( ; ; )
811   {
812     res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) n_end);
813     if (res < 0)
814       break;
815     s+= res;
816     if ((c= (wc - '0')) > 9)
817       goto end_i;
818     i= i*10+c;
819   }
820   if (s == end)
821     goto end_i;
822 
823   /* Handle next 9 digits and store them in j */
824   j= 0;
825   start= s;                                /* Used to know how much to shift i */
826   n_end= true_end= s + 2 * INIT_CNT;
827   if (n_end > end)
828     n_end= end;
829   do
830   {
831     res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
832     if (res < 0)
833       goto no_conv;
834     s+= res;
835     if ((c= (wc - '0')) > 9)
836       goto end_i_and_j;
837     j= j*10+c;
838   } while (s != n_end);
839   if (s == end)
840   {
841     if (s != true_end)
842       goto end_i_and_j;
843     goto end3;
844   }
845   res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
846   if (res < 0)
847     goto no_conv;
848   s+= res;
849   if ((c= (wc - '0')) > 9)
850     goto end3;
851 
852   /* Handle the next 1 or 2 digits and store them in k */
853   k=c;
854   if (s == end)
855     goto end4;
856   res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
857   if (res < 0)
858     goto no_conv;
859   s+= res;
860   if ((c= (wc - '0')) > 9)
861     goto end4;
862   k= k*10+c;
863   *endptr= (char*) s;
864 
865   /* number string should have ended here */
866   if (s != end && (c= (wc - '0')) <= 9)
867     goto overflow;
868 
869   /* Check that we didn't get an overflow with the last digit */
870   if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
871                                      k > cutoff3)))
872     goto overflow;
873   li=i*LFACTOR2+ (ulonglong) j*100 + k;
874   return (longlong) li;
875 
876 overflow:                                        /* *endptr is set here */
877   *error= MY_ERRNO_ERANGE;
878   return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
879 
880 end_i:
881   *endptr= (char*) s;
882   return (negative ? ((longlong) -(long) i) : (longlong) i);
883 
884 end_i_and_j:
885   li= (ulonglong) i * lfactor[(size_t) (s-start) / 2] + j;
886   *endptr= (char*) s;
887   return (negative ? -((longlong) li) : (longlong) li);
888 
889 end3:
890   li=(ulonglong) i*LFACTOR+ (ulonglong) j;
891   *endptr= (char*) s;
892   return (negative ? -((longlong) li) : (longlong) li);
893 
894 end4:
895   li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
896   *endptr= (char*) s;
897   if (negative)
898   {
899    if (li > MAX_NEGATIVE_NUMBER)
900      goto overflow;
901    return -((longlong) li);
902   }
903   return (longlong) li;
904 
905 no_conv:
906   /* There was no number to convert.  */
907   *error= MY_ERRNO_EDOM;
908   *endptr= (char *) nptr;
909   return 0;
910 }
911 
912 
913 static size_t
my_scan_mb2(const CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)914 my_scan_mb2(const CHARSET_INFO *cs,
915             const char *str, const char *end, int sequence_type)
916 {
917   const char *str0= str;
918   my_wc_t wc;
919   int res;
920 
921   switch (sequence_type)
922   {
923   case MY_SEQ_SPACES:
924     for (res= cs->cset->mb_wc(cs, &wc,
925                               (const uchar *) str, (const uchar *) end);
926          res > 0 && wc == ' ';
927          str+= res,
928          res= cs->cset->mb_wc(cs, &wc,
929                               (const uchar *) str, (const uchar *) end))
930     {
931     }
932     return (size_t) (str - str0);
933   default:
934     return 0;
935   }
936 }
937 
938 
939 static void
my_fill_mb2(const CHARSET_INFO * cs,char * s,size_t slen,int fill)940 my_fill_mb2(const CHARSET_INFO *cs, char *s, size_t slen, int fill)
941 {
942   char buf[10];
943   int buflen;
944 
945   DBUG_ASSERT((slen % 2) == 0);
946 
947   buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
948                           (uchar*) buf + sizeof(buf));
949 
950   DBUG_ASSERT(buflen > 0);
951 
952   while (slen >= (size_t) buflen)
953   {
954     /* Enough space for the characer */
955     memcpy(s, buf, (size_t) buflen);
956     s+= buflen;
957     slen-= buflen;
958   }
959 
960   /*
961     If there are some more space which is not enough
962     for the whole multibyte character, then add trailing zeros.
963   */
964   for ( ; slen; slen--)
965   {
966     *s++= 0x00;
967   }
968 }
969 
970 
971 static int
my_vsnprintf_mb2(char * dst,size_t n,const char * fmt,va_list ap)972 my_vsnprintf_mb2(char *dst, size_t n, const char* fmt, va_list ap)
973 {
974   char *start=dst, *end= dst + n - 1;
975   for (; *fmt ; fmt++)
976   {
977     if (fmt[0] != '%')
978     {
979       if (dst == end)                     /* End of buffer */
980         break;
981 
982       *dst++='\0';
983       *dst++= *fmt;          /* Copy ordinary char */
984       continue;
985     }
986 
987     fmt++;
988 
989     /* Skip if max size is used (to be compatible with printf) */
990     while ( (*fmt >= '0' && *fmt <= '9') || *fmt == '.' || *fmt == '-')
991       fmt++;
992 
993     if (*fmt == 'l')
994       fmt++;
995 
996     if (*fmt == 's')                      /* String parameter */
997     {
998       char *par= va_arg(ap, char *);
999       size_t plen;
1000       size_t left_len= (size_t)(end-dst);
1001       if (!par)
1002         par= (char*) "(null)";
1003       plen= strlen(par);
1004       if (left_len <= plen * 2)
1005         plen = left_len / 2 - 1;
1006 
1007       for ( ; plen ; plen--, dst+=2, par++)
1008       {
1009         dst[0]= '\0';
1010         dst[1]= par[0];
1011       }
1012       continue;
1013     }
1014     else if (*fmt == 'd' || *fmt == 'u')  /* Integer parameter */
1015     {
1016       int iarg;
1017       char nbuf[16];
1018       char *pbuf= nbuf;
1019 
1020       if ((size_t) (end - dst) < 32)
1021         break;
1022       iarg= va_arg(ap, int);
1023       if (*fmt == 'd')
1024         int10_to_str((long) iarg, nbuf, -10);
1025       else
1026         int10_to_str((long) (uint) iarg, nbuf,10);
1027 
1028       for (; pbuf[0]; pbuf++)
1029       {
1030         *dst++= '\0';
1031         *dst++= *pbuf;
1032       }
1033       continue;
1034     }
1035 
1036     /* We come here on '%%', unknown code or too long parameter */
1037     if (dst == end)
1038       break;
1039     *dst++= '\0';
1040     *dst++= '%';                            /* % used as % or unknown code */
1041   }
1042 
1043   DBUG_ASSERT(dst <= end);
1044   *dst='\0';                                /* End of errmessage */
1045   return (size_t) (dst - start);
1046 }
1047 
1048 
1049 static size_t
my_snprintf_mb2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * to,size_t n,const char * fmt,...)1050 my_snprintf_mb2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1051                 char* to, size_t n, const char* fmt, ...)
1052 {
1053   size_t retval;
1054   va_list args;
1055   va_start(args,fmt);
1056   retval= my_vsnprintf_mb2(to, n, fmt, args);
1057   va_end(args);
1058   return retval;
1059 }
1060 
1061 
1062 static size_t
my_lengthsp_mb2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)1063 my_lengthsp_mb2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1064                 const char *ptr, size_t length)
1065 {
1066   const char *end= ptr + length;
1067   while (end > ptr + 1 && end[-1] == ' ' && end[-2] == '\0')
1068     end-= 2;
1069   return (size_t) (end - ptr);
1070 }
1071 
1072 #endif /* HAVE_CHARSET_mb2*/
1073 
1074 
1075 
1076 
1077 #ifdef HAVE_CHARSET_utf16
1078 
/*
  D800..DB7F - Non-private surrogate high (896 pages)
  DB80..DBFF - Private surrogate high     (128 pages)
  DC00..DFFF - Surrogate low              (1024 codes in a page)
*/
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF

/* Tests on the leading byte of a 16-bit unit, and on a whole code point */
#define MY_UTF16_HIGH_HEAD(x)  ((((uchar) (x)) & 0xFC) == 0xD8)
#define MY_UTF16_LOW_HEAD(x)   ((((uchar) (x)) & 0xFC) == 0xDC)
#define MY_UTF16_SURROGATE(x)  (((x) & 0xF800) == 0xD800)

/*
  Combine two bytes (most significant first) into a code point.
  Arguments are parenthesized so that expressions can be passed safely.
*/
#define MY_UTF16_WC2(a, b)       (((a) << 8) + (b))

/*
  Combine the four bytes of a surrogate pair into a code point.

  a= 110110??  (<< 18)
  b= ????????  (<< 10)
  c= 110111??  (<<  8)
  d= ????????  (<<  0)
*/
#define MY_UTF16_WC4(a, b, c, d) ((((a) & 3) << 18) + ((b) << 10) + \
                                  (((c) & 3) << 8) + (d) + 0x10000)
1103 
1104 static int
my_utf16_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1105 my_utf16_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1106              my_wc_t *pwc, const uchar *s, const uchar *e)
1107 {
1108   if (s + 2 > e)
1109     return MY_CS_TOOSMALL2;
1110 
1111   /*
1112     High bytes: 0xD[89AB] = B'110110??'
1113     Low bytes:  0xD[CDEF] = B'110111??'
1114     Surrogate mask:  0xFC = B'11111100'
1115   */
1116 
1117   if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
1118   {
1119     if (s + 4 > e)
1120       return MY_CS_TOOSMALL4;
1121 
1122     if (!MY_UTF16_LOW_HEAD(s[2]))  /* Broken surrigate pair */
1123       return MY_CS_ILSEQ;
1124 
1125     *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
1126     return 4;
1127   }
1128 
1129   if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
1130     return MY_CS_ILSEQ;
1131 
1132   *pwc= MY_UTF16_WC2(s[0], s[1]);
1133   return 2;
1134 }
1135 
1136 
1137 static int
my_uni_utf16(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1138 my_uni_utf16(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1139              my_wc_t wc, uchar *s, uchar *e)
1140 {
1141   if (wc <= 0xFFFF)
1142   {
1143     if (s + 2 > e)
1144       return MY_CS_TOOSMALL2;
1145     if (MY_UTF16_SURROGATE(wc))
1146       return MY_CS_ILUNI;
1147     *s++= (uchar) (wc >> 8);
1148     *s= (uchar) (wc & 0xFF);
1149     return 2;
1150   }
1151 
1152   if (wc <= 0x10FFFF)
1153   {
1154     if (s + 4 > e)
1155       return MY_CS_TOOSMALL4;
1156     *s++= (uchar) ((wc-= 0x10000) >> 18) | 0xD8;
1157     *s++= (uchar) (wc >> 10) & 0xFF;
1158     *s++= (uchar) ((wc >> 8) & 3) | 0xDC;
1159     *s= (uchar) wc & 0xFF;
1160     return 4;
1161   }
1162 
1163   return MY_CS_ILUNI;
1164 }
1165 
1166 
1167 static inline void
my_tolower_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1168 my_tolower_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1169 {
1170   MY_UNICASE_CHARACTER *page;
1171   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1172     *wc= page[*wc & 0xFF].tolower;
1173 }
1174 
1175 
1176 static inline void
my_toupper_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1177 my_toupper_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1178 {
1179   MY_UNICASE_CHARACTER *page;
1180   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1181     *wc= page[*wc & 0xFF].toupper;
1182 }
1183 
1184 
1185 static inline void
my_tosort_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1186 my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1187 {
1188   if (*wc <= uni_plane->maxchar)
1189   {
1190     MY_UNICASE_CHARACTER *page;
1191     if ((page= uni_plane->page[*wc >> 8]))
1192       *wc= page[*wc & 0xFF].sort;
1193   }
1194   else
1195   {
1196     *wc= MY_CS_REPLACEMENT_CHARACTER;
1197   }
1198 }
1199 
1200 
1201 
1202 static size_t
my_caseup_utf16(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1203 my_caseup_utf16(const CHARSET_INFO *cs, char *src, size_t srclen,
1204                 char *dst MY_ATTRIBUTE((unused)),
1205                 size_t dstlen MY_ATTRIBUTE((unused)))
1206 {
1207   my_wc_t wc;
1208   int res;
1209   char *srcend= src + srclen;
1210   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1211   DBUG_ASSERT(src == dst && srclen == dstlen);
1212 
1213   while ((src < srcend) &&
1214          (res= cs->cset->mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1215   {
1216     my_toupper_utf16(uni_plane, &wc);
1217     if (res != cs->cset->wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
1218       break;
1219     src+= res;
1220   }
1221   return srclen;
1222 }
1223 
1224 
1225 static void
my_hash_sort_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * n1,ulong * n2)1226 my_hash_sort_utf16(const CHARSET_INFO *cs, const uchar *s, size_t slen,
1227                    ulong *n1, ulong *n2)
1228 {
1229   my_wc_t wc;
1230   int res;
1231   const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen);
1232   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1233 
1234   while ((s < e) && (res= cs->cset->mb_wc(cs, &wc,
1235                                           (uchar *) s, (uchar *) e)) > 0)
1236   {
1237     my_tosort_utf16(uni_plane, &wc);
1238     n1[0]^= (((n1[0] & 63) + n2[0]) * (wc & 0xFF)) + (n1[0] << 8);
1239     n2[0]+= 3;
1240     n1[0]^= (((n1[0] & 63) + n2[0]) * (wc >> 8)) + (n1[0] << 8);
1241     n2[0]+= 3;
1242     s+= res;
1243   }
1244 }
1245 
1246 
1247 static size_t
my_casedn_utf16(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1248 my_casedn_utf16(const CHARSET_INFO *cs, char *src, size_t srclen,
1249                 char *dst MY_ATTRIBUTE((unused)),
1250                 size_t dstlen MY_ATTRIBUTE((unused)))
1251 {
1252   my_wc_t wc;
1253   int res;
1254   char *srcend= src + srclen;
1255   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1256   DBUG_ASSERT(src == dst && srclen == dstlen);
1257 
1258   while ((src < srcend) &&
1259          (res= cs->cset->mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1260   {
1261     my_tolower_utf16(uni_plane, &wc);
1262     if (res != cs->cset->wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
1263       break;
1264     src+= res;
1265   }
1266   return srclen;
1267 }
1268 
1269 
1270 static int
my_strnncoll_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)1271 my_strnncoll_utf16(const CHARSET_INFO *cs,
1272                    const uchar *s, size_t slen,
1273                    const uchar *t, size_t tlen,
1274                    my_bool t_is_prefix)
1275 {
1276   int s_res, t_res;
1277   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
1278   const uchar *se= s + slen;
1279   const uchar *te= t + tlen;
1280   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1281 
1282   while (s < se && t < te)
1283   {
1284     s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1285     t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1286 
1287     if (s_res <= 0 || t_res <= 0)
1288     {
1289       /* Incorrect string, compare by char value */
1290       return my_bincmp(s, se, t, te);
1291     }
1292 
1293     my_tosort_utf16(uni_plane, &s_wc);
1294     my_tosort_utf16(uni_plane, &t_wc);
1295 
1296     if (s_wc != t_wc)
1297     {
1298       return  s_wc > t_wc ? 1 : -1;
1299     }
1300 
1301     s+= s_res;
1302     t+= t_res;
1303   }
1304   return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
1305 }
1306 
1307 
/**
  Compare strings, discarding end space

  If one string is shorter than the other, then we space extend the shorter
  one so that the strings have equal length.

  This will ensure that the following things hold:

    "a"  == "a "
    "a\0" < "a"
    "a\0" < "a "

  @param  cs    Character set pointer.
  @param  s     First string to compare.
  @param  slen  Length of 's'.
  @param  t     Second string to compare.
  @param  tlen  Length of 't'.
  @param  diff_if_only_endspace_difference
                Forced to FALSE unless
                VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is
                defined; when set, strings that differ only in trailing
                spaces compare as different.

  IMPLEMENTATION

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t
    @retval Positive number, if s > t
*/
1333 
1334 static int
my_strnncollsp_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)1335 my_strnncollsp_utf16(const CHARSET_INFO *cs,
1336                      const uchar *s, size_t slen,
1337                      const uchar *t, size_t tlen,
1338                      my_bool diff_if_only_endspace_difference)
1339 {
1340   int res;
1341   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
1342   const uchar *se= s + slen, *te= t + tlen;
1343   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1344 
1345   DBUG_ASSERT((slen % 2) == 0);
1346   DBUG_ASSERT((tlen % 2) == 0);
1347 
1348 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
1349   diff_if_only_endspace_difference= FALSE;
1350 #endif
1351 
1352   while (s < se && t < te)
1353   {
1354     int s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1355     int t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1356 
1357     if (s_res <= 0 || t_res <= 0)
1358     {
1359       /* Incorrect string, compare bytewise */
1360       return my_bincmp(s, se, t, te);
1361     }
1362 
1363     my_tosort_utf16(uni_plane, &s_wc);
1364     my_tosort_utf16(uni_plane, &t_wc);
1365 
1366     if (s_wc != t_wc)
1367     {
1368       return s_wc > t_wc ? 1 : -1;
1369     }
1370 
1371     s+= s_res;
1372     t+= t_res;
1373   }
1374 
1375   slen= (size_t) (se - s);
1376   tlen= (size_t) (te - t);
1377   res= 0;
1378 
1379   if (slen != tlen)
1380   {
1381     int s_res, swap= 1;
1382     if (diff_if_only_endspace_difference)
1383       res= 1;                                   /* Assume 's' is bigger */
1384     if (slen < tlen)
1385     {
1386       slen= tlen;
1387       s= t;
1388       se= te;
1389       swap= -1;
1390       res= -res;
1391     }
1392 
1393     for ( ; s < se; s+= s_res)
1394     {
1395       if ((s_res= cs->cset->mb_wc(cs, &s_wc, s, se)) < 0)
1396       {
1397         DBUG_ASSERT(0);
1398         return 0;
1399       }
1400       if (s_wc != ' ')
1401         return (s_wc < ' ') ? -swap : swap;
1402     }
1403   }
1404   return res;
1405 }
1406 
1407 
1408 static uint
my_ismbchar_utf16(const CHARSET_INFO * cs,const char * b,const char * e)1409 my_ismbchar_utf16(const CHARSET_INFO *cs, const char *b, const char *e)
1410 {
1411   my_wc_t wc;
1412   int res= cs->cset->mb_wc(cs, &wc, (const uchar *) b, (const uchar *) e);
1413   return (uint) (res > 0 ? res : 0);
1414 }
1415 
1416 
1417 static uint
my_mbcharlen_utf16(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))1418 my_mbcharlen_utf16(const CHARSET_INFO *cs  MY_ATTRIBUTE((unused)),
1419                    uint c MY_ATTRIBUTE((unused)))
1420 {
1421   DBUG_ASSERT(0);
1422   return MY_UTF16_HIGH_HEAD(c) ? 4 : 2;
1423 }
1424 
1425 
1426 static size_t
my_numchars_utf16(const CHARSET_INFO * cs,const char * b,const char * e)1427 my_numchars_utf16(const CHARSET_INFO *cs,
1428                   const char *b, const char *e)
1429 {
1430   size_t nchars= 0;
1431   for ( ; ; nchars++)
1432   {
1433     size_t charlen= my_ismbchar_utf16(cs, b, e);
1434     if (!charlen)
1435       break;
1436     b+= charlen;
1437   }
1438   return nchars;
1439 }
1440 
1441 
1442 static size_t
my_charpos_utf16(const CHARSET_INFO * cs,const char * b,const char * e,size_t pos)1443 my_charpos_utf16(const CHARSET_INFO *cs,
1444                  const char *b, const char *e, size_t pos)
1445 {
1446   const char *b0= b;
1447   uint charlen;
1448 
1449   for ( ; pos; b+= charlen, pos--)
1450   {
1451     if (!(charlen= my_ismbchar(cs, b, e)))
1452       return (e + 2 - b0); /* Error, return pos outside the string */
1453   }
1454   return (size_t) (pos ? (e + 2 - b0) : (b - b0));
1455 }
1456 
1457 
1458 static size_t
my_well_formed_len_utf16(const CHARSET_INFO * cs,const char * b,const char * e,size_t nchars,int * error)1459 my_well_formed_len_utf16(const CHARSET_INFO *cs,
1460                          const char *b, const char *e,
1461                          size_t nchars, int *error)
1462 {
1463   const char *b0= b;
1464   uint charlen;
1465   *error= 0;
1466 
1467   for ( ; nchars; b+= charlen, nchars--)
1468   {
1469     if (!(charlen= my_ismbchar(cs, b, e)))
1470     {
1471       *error= b < e ? 1 : 0;
1472       break;
1473     }
1474   }
1475   return (size_t) (b - b0);
1476 }
1477 
1478 
1479 static int
my_wildcmp_utf16_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1480 my_wildcmp_utf16_ci(const CHARSET_INFO *cs,
1481                     const char *str,const char *str_end,
1482                     const char *wildstr,const char *wildend,
1483                     int escape, int w_one, int w_many)
1484 {
1485   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1486   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1487                             escape, w_one, w_many, uni_plane);
1488 }
1489 
1490 
1491 static int
my_wildcmp_utf16_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1492 my_wildcmp_utf16_bin(const CHARSET_INFO *cs,
1493                      const char *str,const char *str_end,
1494                      const char *wildstr,const char *wildend,
1495                      int escape, int w_one, int w_many)
1496 {
1497   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1498                             escape, w_one, w_many, NULL);
1499 }
1500 
1501 
1502 static int
my_strnncoll_utf16_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)1503 my_strnncoll_utf16_bin(const CHARSET_INFO *cs,
1504                        const uchar *s, size_t slen,
1505                        const uchar *t, size_t tlen,
1506                        my_bool t_is_prefix)
1507 {
1508   int s_res,t_res;
1509   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
1510   const uchar *se=s+slen;
1511   const uchar *te=t+tlen;
1512 
1513   while ( s < se && t < te )
1514   {
1515     s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1516     t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1517 
1518     if (s_res <= 0 || t_res <= 0)
1519     {
1520       /* Incorrect string, compare by char value */
1521       return my_bincmp(s, se, t, te);
1522     }
1523     if (s_wc != t_wc)
1524     {
1525       return s_wc > t_wc ? 1 : -1;
1526     }
1527 
1528     s+= s_res;
1529     t+= t_res;
1530   }
1531   return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
1532 }
1533 
1534 
1535 static int
my_strnncollsp_utf16_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)1536 my_strnncollsp_utf16_bin(const CHARSET_INFO *cs,
1537                          const uchar *s, size_t slen,
1538                          const uchar *t, size_t tlen,
1539                          my_bool diff_if_only_endspace_difference)
1540 {
1541   int res;
1542   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
1543   const uchar *se= s + slen, *te= t + tlen;
1544 
1545   DBUG_ASSERT((slen % 2) == 0);
1546   DBUG_ASSERT((tlen % 2) == 0);
1547 
1548 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
1549   diff_if_only_endspace_difference= FALSE;
1550 #endif
1551 
1552   while (s < se && t < te)
1553   {
1554     int s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1555     int t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1556 
1557     if (s_res <= 0 || t_res <= 0)
1558     {
1559       /* Incorrect string, compare bytewise */
1560       return my_bincmp(s, se, t, te);
1561     }
1562 
1563     if (s_wc != t_wc)
1564     {
1565       return s_wc > t_wc ? 1 : -1;
1566     }
1567 
1568     s+= s_res;
1569     t+= t_res;
1570   }
1571 
1572   slen= (size_t) (se - s);
1573   tlen= (size_t) (te - t);
1574   res= 0;
1575 
1576   if (slen != tlen)
1577   {
1578     int s_res, swap= 1;
1579     if (diff_if_only_endspace_difference)
1580       res= 1;                                   /* Assume 's' is bigger */
1581     if (slen < tlen)
1582     {
1583       slen= tlen;
1584       s= t;
1585       se= te;
1586       swap= -1;
1587       res= -res;
1588     }
1589 
1590     for ( ; s < se; s+= s_res)
1591     {
1592       if ((s_res= cs->cset->mb_wc(cs, &s_wc, s, se)) < 0)
1593       {
1594         DBUG_ASSERT(0);
1595         return 0;
1596       }
1597       if (s_wc != ' ')
1598         return (s_wc < ' ') ? -swap : swap;
1599     }
1600   }
1601   return res;
1602 }
1603 
1604 
1605 static void
my_hash_sort_utf16_bin(const CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1606 my_hash_sort_utf16_bin(const CHARSET_INFO *cs,
1607                        const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
1608 {
1609   const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len);
1610   for ( ; pos < end ; pos++)
1611   {
1612     nr1[0]^= (ulong) ((((uint) nr1[0] & 63) + nr2[0]) *
1613               ((uint)*pos)) + (nr1[0] << 8);
1614     nr2[0]+= 3;
1615   }
1616 }
1617 
1618 
/*
  Collation handler for the "general_ci" UTF-16 collations.
  Referenced by my_charset_utf16_general_ci and
  my_charset_utf16le_general_ci.
*/
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16,
  my_strnncollsp_utf16,
  my_strnxfrm_unicode,
  my_strnxfrmlen_simple,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16,
  my_propagate_simple
};
1633 
1634 
/*
  Collation handler for the binary UTF-16 collations.
  Referenced by my_charset_utf16_bin and my_charset_utf16le_bin.
*/
static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_bin,
  my_strnncollsp_utf16_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_bin,
  my_propagate_simple
};
1649 
1650 
/*
  Character set handler for big-endian UTF-16.
  Referenced by my_charset_utf16_general_ci and my_charset_utf16_bin.
  Not declared static, so it has external linkage.
*/
MY_CHARSET_HANDLER my_charset_utf16_handler=
{
  NULL,                /* init         */
  my_ismbchar_utf16,   /* ismbchar     */
  my_mbcharlen_utf16,  /* mbcharlen    */
  my_numchars_utf16,
  my_charpos_utf16,
  my_well_formed_len_utf16,
  my_lengthsp_mb2,
  my_numcells_mb,
  my_utf16_uni,        /* mb_wc        */
  my_uni_utf16,        /* wc_mb        */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf16,
  my_casedn_utf16,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_mb2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2
};
1681 
1682 
/*
  Definition of the "utf16_general_ci" collation (number 54) of the
  "utf16" character set: 2 to 4 bytes per character, ' ' as pad
  character, my_unicase_default as case-info table.
*/
CHARSET_INFO my_charset_utf16_general_ci=
{
  54,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16",             /* cs name      */
  "utf16_general_ci",  /* name         */
  "UTF-16 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_compare */
  1,                   /* levels_for_order   */
  &my_charset_utf16_handler,               /* character set handler */
  &my_collation_utf16_general_ci_handler   /* collation handler     */
};
1715 
1716 
/*
  Definition of the "utf16_bin" collation (number 55) of the "utf16"
  character set: 2 to 4 bytes per character, ' ' as pad character,
  flagged MY_CS_BINSORT.
*/
CHARSET_INFO my_charset_utf16_bin=
{
  55,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16",             /* cs name      */
  "utf16_bin",         /* name         */
  "UTF-16 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_compare */
  1,                   /* levels_for_order   */
  &my_charset_utf16_handler,        /* character set handler */
  &my_collation_utf16_bin_handler   /* collation handler     */
};
1749 
1750 
1751 static int
my_utf16le_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1752 my_utf16le_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1753                my_wc_t *pwc, const uchar *s, const uchar *e)
1754 {
1755   my_wc_t lo;
1756 
1757   if (s + 2 > e)
1758     return MY_CS_TOOSMALL2;
1759 
1760   if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
1761       (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
1762     return 2; /* [0000-D7FF,E000-FFFF] */
1763 
1764   if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
1765     return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
1766 
1767   if (s + 4  > e)
1768     return MY_CS_TOOSMALL4;
1769 
1770   s+= 2;
1771 
1772   if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
1773       lo > MY_UTF16_SURROGATE_LOW_LAST)
1774     return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
1775 
1776   *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF));
1777   return 4;
1778 }
1779 
1780 
1781 static int
my_uni_utf16le(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1782 my_uni_utf16le(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1783                my_wc_t wc, uchar *s, uchar *e)
1784 {
1785   if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
1786       (wc > MY_UTF16_SURROGATE_LOW_LAST &&
1787        wc <= 0xFFFF))
1788   {
1789     if (s + 2 > e)
1790       return MY_CS_TOOSMALL2;
1791     int2store(s, wc);
1792     return 2; /* [0000-D7FF,E000-FFFF] */
1793   }
1794 
1795   if (wc < 0xFFFF || wc > 0x10FFFF)
1796     return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */
1797 
1798   if (s + 4 > e)
1799     return MY_CS_TOOSMALL4;
1800 
1801   wc-= 0x10000;
1802   int2store(s,     (0xD800 | ((wc >> 10) & 0x3FF))); s+= 2;
1803   int2store(s,     (0xDC00 | (wc & 0x3FF)));
1804   return 4; /* [010000-10FFFF] */
1805 }
1806 
1807 
1808 static size_t
my_lengthsp_utf16le(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)1809 my_lengthsp_utf16le(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1810                     const char *ptr, size_t length)
1811 {
1812   const char *end= ptr + length;
1813   while (end > ptr + 1 && uint2korr(end - 2) == 0x20)
1814     end-= 2;
1815   return (size_t) (end - ptr);
1816 }
1817 
1818 
/*
  Character set handler for little-endian UTF-16.
  Referenced by my_charset_utf16le_general_ci and my_charset_utf16le_bin.
  Differs from my_charset_utf16_handler in three entries:
  my_lengthsp_utf16le, my_utf16le_uni and my_uni_utf16le.
*/
static MY_CHARSET_HANDLER my_charset_utf16le_handler=
{
  NULL,                /* init         */
  my_ismbchar_utf16,
  my_mbcharlen_utf16,
  my_numchars_utf16,
  my_charpos_utf16,
  my_well_formed_len_utf16,
  my_lengthsp_utf16le,
  my_numcells_mb,
  my_utf16le_uni,      /* mb_wc        */
  my_uni_utf16le,      /* wc_mb        */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf16,
  my_casedn_utf16,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_mb2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2
};
1849 
1850 
/*
  Definition of the "utf16le_general_ci" collation (number 56) of the
  "utf16le" character set: 2 to 4 bytes per character, ' ' as pad
  character.  Uses the same collation handler as utf16_general_ci.
*/
CHARSET_INFO my_charset_utf16le_general_ci=
{
  56,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16le",           /* cs name      */
  "utf16le_general_ci",/* name         */
  "UTF-16LE Unicode",  /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_compare */
  1,                   /* levels_for_order   */
  &my_charset_utf16le_handler,             /* character set handler */
  &my_collation_utf16_general_ci_handler   /* collation handler     */
};
1883 
1884 
/*
  Definition of the "utf16le_bin" collation (number 62) of the "utf16le"
  character set: 2 to 4 bytes per character, ' ' as pad character,
  flagged MY_CS_BINSORT.  Uses the same collation handler as utf16_bin.
*/
CHARSET_INFO my_charset_utf16le_bin=
{
  62,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16le",           /* cs name      */
  "utf16le_bin",       /* name         */
  "UTF-16LE Unicode",  /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  2,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_compare */
  1,                   /* levels_for_order   */
  &my_charset_utf16le_handler,      /* character set handler */
  &my_collation_utf16_bin_handler   /* collation handler     */
};
1917 
1918 
1919 #endif /* HAVE_CHARSET_utf16 */
1920 
1921 
1922 #ifdef HAVE_CHARSET_utf32
1923 
1924 static int
my_utf32_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1925 my_utf32_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1926              my_wc_t *pwc, const uchar *s, const uchar *e)
1927 {
1928   if (s + 4 > e)
1929     return MY_CS_TOOSMALL4;
1930   *pwc= (((my_wc_t)s[0]) << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
1931   return 4;
1932 }
1933 
1934 
1935 static int
my_uni_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1936 my_uni_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1937              my_wc_t wc, uchar *s, uchar *e)
1938 {
1939   if (s + 4 > e)
1940     return MY_CS_TOOSMALL4;
1941 
1942   s[0]= (uchar) (wc >> 24);
1943   s[1]= (uchar) (wc >> 16) & 0xFF;
1944   s[2]= (uchar) (wc >> 8)  & 0xFF;
1945   s[3]= (uchar) wc & 0xFF;
1946   return 4;
1947 }
1948 
1949 
1950 static inline void
my_tolower_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1951 my_tolower_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1952 {
1953   MY_UNICASE_CHARACTER *page;
1954   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1955     *wc= page[*wc & 0xFF].tolower;
1956 }
1957 
1958 
1959 static inline void
my_toupper_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1960 my_toupper_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1961 {
1962   MY_UNICASE_CHARACTER *page;
1963   if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1964     *wc= page[*wc & 0xFF].toupper;
1965 }
1966 
1967 
1968 static inline void
my_tosort_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1969 my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1970 {
1971   if (*wc <= uni_plane->maxchar)
1972   {
1973     MY_UNICASE_CHARACTER *page;
1974     if ((page= uni_plane->page[*wc >> 8]))
1975       *wc= page[*wc & 0xFF].sort;
1976   }
1977   else
1978   {
1979     *wc= MY_CS_REPLACEMENT_CHARACTER;
1980   }
1981 }
1982 
1983 
1984 static size_t
my_caseup_utf32(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1985 my_caseup_utf32(const CHARSET_INFO *cs, char *src, size_t srclen,
1986                 char *dst MY_ATTRIBUTE((unused)),
1987                 size_t dstlen MY_ATTRIBUTE((unused)))
1988 {
1989   my_wc_t wc;
1990   int res;
1991   char *srcend= src + srclen;
1992   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1993   DBUG_ASSERT(src == dst && srclen == dstlen);
1994 
1995   while ((src < srcend) &&
1996          (res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
1997   {
1998     my_toupper_utf32(uni_plane, &wc);
1999     if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
2000       break;
2001     src+= res;
2002   }
2003   return srclen;
2004 }
2005 
2006 
2007 static inline void
my_hash_add(ulong * n1,ulong * n2,uint ch)2008 my_hash_add(ulong *n1, ulong *n2, uint ch)
2009 {
2010   n1[0]^= (((n1[0] & 63) + n2[0]) * (ch)) + (n1[0] << 8);
2011   n2[0]+= 3;
2012 }
2013 
2014 
2015 static void
my_hash_sort_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * n1,ulong * n2)2016 my_hash_sort_utf32(const CHARSET_INFO *cs, const uchar *s, size_t slen,
2017                    ulong *n1, ulong *n2)
2018 {
2019   my_wc_t wc;
2020   int res;
2021   const uchar *e= s + slen;
2022   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2023 
2024   /* Skip trailing spaces */
2025   while (e > s + 3 && e[-1] == ' ' && !e[-2] && !e[-3] && !e[-4])
2026     e-= 4;
2027 
2028   while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
2029   {
2030     my_tosort_utf32(uni_plane, &wc);
2031     my_hash_add(n1, n2, (uint) (wc >> 24));
2032     my_hash_add(n1, n2, (uint) (wc >> 16) & 0xFF);
2033     my_hash_add(n1, n2, (uint) (wc >> 8)  & 0xFF);
2034     my_hash_add(n1, n2, (uint) (wc & 0xFF));
2035     s+= res;
2036   }
2037 }
2038 
2039 
2040 static size_t
my_casedn_utf32(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))2041 my_casedn_utf32(const CHARSET_INFO *cs, char *src, size_t srclen,
2042                 char *dst MY_ATTRIBUTE((unused)),
2043                 size_t dstlen MY_ATTRIBUTE((unused)))
2044 {
2045   my_wc_t wc;
2046   int res;
2047   char *srcend= src + srclen;
2048   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2049   DBUG_ASSERT(src == dst && srclen == dstlen);
2050 
2051   while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
2052   {
2053     my_tolower_utf32(uni_plane,&wc);
2054     if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
2055       break;
2056     src+= res;
2057   }
2058   return srclen;
2059 }
2060 
2061 
2062 static int
my_strnncoll_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)2063 my_strnncoll_utf32(const CHARSET_INFO *cs,
2064                    const uchar *s, size_t slen,
2065                    const uchar *t, size_t tlen,
2066                    my_bool t_is_prefix)
2067 {
2068   my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc);
2069   const uchar *se= s + slen;
2070   const uchar *te= t + tlen;
2071   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2072 
2073   while (s < se && t < te)
2074   {
2075     int s_res= my_utf32_uni(cs, &s_wc, s, se);
2076     int t_res= my_utf32_uni(cs, &t_wc, t, te);
2077 
2078     if ( s_res <= 0 || t_res <= 0)
2079     {
2080       /* Incorrect string, compare by char value */
2081       return my_bincmp(s, se, t, te);
2082     }
2083 
2084     my_tosort_utf32(uni_plane, &s_wc);
2085     my_tosort_utf32(uni_plane, &t_wc);
2086 
2087     if (s_wc != t_wc)
2088     {
2089       return s_wc > t_wc ? 1 : -1;
2090     }
2091 
2092     s+= s_res;
2093     t+= t_res;
2094   }
2095   return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
2096 }
2097 
2098 
/**
  Compare strings, discarding end space

  If one string is shorter than the other, then we space extend the shorter
  one so that the strings have equal length.

  This will ensure that the following things hold:

    "a"  == "a "
    "a\0" < "a"
    "a\0" < "a "

  @param  cs    Character set pointer.
  @param  s     First string to compare.
  @param  slen  Length of 's'.
  @param  t     Second string to compare.
  @param  tlen  Length of 't'.
  @param  diff_if_only_endspace_difference
                Forced to FALSE unless
                VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is
                defined; when set, strings that differ only in trailing
                spaces compare as different.

  IMPLEMENTATION

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t
    @retval Positive number, if s > t
*/
2124 
2125 
2126 static int
my_strnncollsp_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)2127 my_strnncollsp_utf32(const CHARSET_INFO *cs,
2128                      const uchar *s, size_t slen,
2129                      const uchar *t, size_t tlen,
2130                      my_bool diff_if_only_endspace_difference)
2131 {
2132   int res;
2133   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
2134   const uchar *se= s + slen, *te= t + tlen;
2135   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2136 
2137   DBUG_ASSERT((slen % 4) == 0);
2138   DBUG_ASSERT((tlen % 4) == 0);
2139 
2140 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
2141   diff_if_only_endspace_difference= FALSE;
2142 #endif
2143 
2144   while ( s < se && t < te )
2145   {
2146     int s_res= my_utf32_uni(cs, &s_wc, s, se);
2147     int t_res= my_utf32_uni(cs, &t_wc, t, te);
2148 
2149     if ( s_res <= 0 || t_res <= 0 )
2150     {
2151       /* Incorrect string, compare bytewise */
2152       return my_bincmp(s, se, t, te);
2153     }
2154 
2155     my_tosort_utf32(uni_plane, &s_wc);
2156     my_tosort_utf32(uni_plane, &t_wc);
2157 
2158     if ( s_wc != t_wc )
2159     {
2160       return s_wc > t_wc ? 1 : -1;
2161     }
2162 
2163     s+= s_res;
2164     t+= t_res;
2165   }
2166 
2167   slen= (size_t) (se - s);
2168   tlen= (size_t) (te - t);
2169   res= 0;
2170 
2171   if (slen != tlen)
2172   {
2173     int s_res, swap= 1;
2174     if (diff_if_only_endspace_difference)
2175       res= 1;                                   /* Assume 's' is bigger */
2176     if (slen < tlen)
2177     {
2178       slen= tlen;
2179       s= t;
2180       se= te;
2181       swap= -1;
2182       res= -res;
2183     }
2184 
2185     for ( ; s < se; s+= s_res)
2186     {
2187       if ((s_res= my_utf32_uni(cs, &s_wc, s, se)) < 0)
2188       {
2189         DBUG_ASSERT(0);
2190         return 0;
2191       }
2192       if (s_wc != ' ')
2193         return (s_wc < ' ') ? -swap : swap;
2194     }
2195   }
2196   return res;
2197 }
2198 
2199 
2200 static size_t
my_strnxfrmlen_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),size_t len)2201 my_strnxfrmlen_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2202                      size_t len)
2203 {
2204   return len / 2;
2205 }
2206 
2207 
2208 static uint
my_ismbchar_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)))2209 my_ismbchar_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2210                   const char *b MY_ATTRIBUTE((unused)),
2211                   const char *e MY_ATTRIBUTE((unused)))
2212 {
2213   return 4;
2214 }
2215 
2216 
2217 static uint
my_mbcharlen_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))2218 my_mbcharlen_utf32(const CHARSET_INFO *cs  MY_ATTRIBUTE((unused)) ,
2219                    uint c MY_ATTRIBUTE((unused)))
2220 {
2221   return 4;
2222 }
2223 
2224 
2225 static int
my_vsnprintf_utf32(char * dst,size_t n,const char * fmt,va_list ap)2226 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
2227 {
2228   char *start= dst, *end= dst + n;
2229   DBUG_ASSERT((n % 4) == 0);
2230   for (; *fmt ; fmt++)
2231   {
2232     if (fmt[0] != '%')
2233     {
2234       if (dst >= end)                        /* End of buffer */
2235         break;
2236 
2237       *dst++= '\0';
2238       *dst++= '\0';
2239       *dst++= '\0';
2240       *dst++= *fmt;        /* Copy ordinary char */
2241       continue;
2242     }
2243 
2244     fmt++;
2245 
2246     /* Skip if max size is used (to be compatible with printf) */
2247     while ( (*fmt>='0' && *fmt<='9') || *fmt == '.' || *fmt == '-')
2248       fmt++;
2249 
2250     if (*fmt == 'l')
2251       fmt++;
2252 
2253     if (*fmt == 's')                                /* String parameter */
2254     {
2255       char *par= va_arg(ap, char *);
2256       size_t plen;
2257       size_t left_len= (size_t)(end - dst);
2258       if (!par) par= (char*)"(null)";
2259       plen= strlen(par);
2260       if (left_len <= plen*4)
2261         plen= left_len / 4 - 1;
2262 
2263       for ( ; plen ; plen--, dst+= 4, par++)
2264       {
2265         dst[0]= '\0';
2266         dst[1]= '\0';
2267         dst[2]= '\0';
2268         dst[3]= par[0];
2269       }
2270       continue;
2271     }
2272     else if (*fmt == 'd' || *fmt == 'u')        /* Integer parameter */
2273     {
2274       int iarg;
2275       char nbuf[16];
2276       char *pbuf= nbuf;
2277 
2278       if ((size_t) (end - dst) < 64)
2279         break;
2280       iarg= va_arg(ap, int);
2281       if (*fmt == 'd')
2282         int10_to_str((long) iarg, nbuf, -10);
2283       else
2284         int10_to_str((long) (uint) iarg,nbuf,10);
2285 
2286       for (; pbuf[0]; pbuf++)
2287       {
2288         *dst++= '\0';
2289         *dst++= '\0';
2290         *dst++= '\0';
2291         *dst++= *pbuf;
2292       }
2293       continue;
2294     }
2295 
2296     /* We come here on '%%', unknown code or too long parameter */
2297     if (dst == end)
2298       break;
2299     *dst++= '\0';
2300     *dst++= '\0';
2301     *dst++= '\0';
2302     *dst++= '%';    /* % used as % or unknown code */
2303   }
2304 
2305   DBUG_ASSERT(dst < end);
2306   *dst++= '\0';
2307   *dst++= '\0';
2308   *dst++= '\0';
2309   *dst++= '\0';     /* End of errmessage */
2310   return (size_t) (dst - start - 4);
2311 }
2312 
2313 
2314 static size_t
my_snprintf_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * to,size_t n,const char * fmt,...)2315 my_snprintf_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2316                   char* to, size_t n, const char* fmt, ...)
2317 {
2318   size_t retval;
2319   va_list args;
2320   va_start(args,fmt);
2321   retval= my_vsnprintf_utf32(to, n, fmt, args);
2322   va_end(args);
2323   return retval;
2324 }
2325 
2326 
2327 static longlong
my_strtoll10_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * nptr,char ** endptr,int * error)2328 my_strtoll10_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2329                    const char *nptr, char **endptr, int *error)
2330 {
2331   const char *s, *end, *start, *n_end, *true_end;
2332   uchar c;
2333   unsigned long i, j, k;
2334   ulonglong li;
2335   int negative;
2336   ulong cutoff, cutoff2, cutoff3;
2337 
2338   s= nptr;
2339   /* If fixed length string */
2340   if (endptr)
2341   {
2342     /* Make sure string length is even */
2343     end= s + ((*endptr - s) / 4) * 4;
2344     while (s < end && !s[0] && !s[1] && !s[2] &&
2345            (s[3] == ' ' || s[3] == '\t'))
2346       s+= 4;
2347     if (s == end)
2348       goto no_conv;
2349   }
2350   else
2351   {
2352      /* We don't support null terminated strings in UCS2 */
2353      goto no_conv;
2354   }
2355 
2356   /* Check for a sign. */
2357   negative= 0;
2358   if (!s[0] && !s[1] && !s[2] && s[3] == '-')
2359   {
2360     *error= -1;                                        /* Mark as negative number */
2361     negative= 1;
2362     s+= 4;
2363     if (s == end)
2364       goto no_conv;
2365     cutoff=  MAX_NEGATIVE_NUMBER / LFACTOR2;
2366     cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
2367     cutoff3=  MAX_NEGATIVE_NUMBER % 100;
2368   }
2369   else
2370   {
2371     *error= 0;
2372     if (!s[0] && !s[1] && !s[2] && s[3] == '+')
2373     {
2374       s+= 4;
2375       if (s == end)
2376         goto no_conv;
2377     }
2378     cutoff=  ULONGLONG_MAX / LFACTOR2;
2379     cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
2380     cutoff3=  ULONGLONG_MAX % 100;
2381   }
2382 
2383   /* Handle case where we have a lot of pre-zero */
2384   if (!s[0] && !s[1] && !s[2] && s[3] == '0')
2385   {
2386     i= 0;
2387     do
2388     {
2389       s+= 4;
2390       if (s == end)
2391         goto end_i;                                /* Return 0 */
2392     }
2393     while (!s[0] && !s[1] && !s[2] && s[3] == '0');
2394     n_end= s + 4 * INIT_CNT;
2395   }
2396   else
2397   {
2398     /* Read first digit to check that it's a valid number */
2399     if (s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2400       goto no_conv;
2401     i= c;
2402     s+= 4;
2403     n_end= s + 4 * (INIT_CNT-1);
2404   }
2405 
2406   /* Handle first 9 digits and store them in i */
2407   if (n_end > end)
2408     n_end= end;
2409   for (; s != n_end ; s+= 4)
2410   {
2411     if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2412       goto end_i;
2413     i= i * 10 + c;
2414   }
2415   if (s == end)
2416     goto end_i;
2417 
2418   /* Handle next 9 digits and store them in j */
2419   j= 0;
2420   start= s;                                /* Used to know how much to shift i */
2421   n_end= true_end= s + 4 * INIT_CNT;
2422   if (n_end > end)
2423     n_end= end;
2424   do
2425   {
2426     if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2427       goto end_i_and_j;
2428     j= j * 10 + c;
2429     s+= 4;
2430   } while (s != n_end);
2431   if (s == end)
2432   {
2433     if (s != true_end)
2434       goto end_i_and_j;
2435     goto end3;
2436   }
2437   if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2438     goto end3;
2439 
2440   /* Handle the next 1 or 2 digits and store them in k */
2441   k=c;
2442   s+= 4;
2443   if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2444     goto end4;
2445   k= k * 10 + c;
2446   s+= 2;
2447   *endptr= (char*) s;
2448 
2449   /* number string should have ended here */
2450   if (s != end && !s[0] && !s[1] && !s[2] && (c= (s[3] - '0')) <= 9)
2451     goto overflow;
2452 
2453   /* Check that we didn't get an overflow with the last digit */
2454   if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
2455                                      k > cutoff3)))
2456     goto overflow;
2457   li= i * LFACTOR2+ (ulonglong) j * 100 + k;
2458   return (longlong) li;
2459 
2460 overflow:                                        /* *endptr is set here */
2461   *error= MY_ERRNO_ERANGE;
2462   return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
2463 
2464 end_i:
2465   *endptr= (char*) s;
2466   return (negative ? ((longlong) -(long) i) : (longlong) i);
2467 
2468 end_i_and_j:
2469   li= (ulonglong) i * lfactor[(size_t) (s-start) / 4] + j;
2470   *endptr= (char*) s;
2471   return (negative ? -((longlong) li) : (longlong) li);
2472 
2473 end3:
2474   li= (ulonglong) i*LFACTOR+ (ulonglong) j;
2475   *endptr= (char*) s;
2476   return (negative ? -((longlong) li) : (longlong) li);
2477 
2478 end4:
2479   li= (ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
2480   *endptr= (char*) s;
2481   if (negative)
2482   {
2483    if (li > MAX_NEGATIVE_NUMBER)
2484      goto overflow;
2485    return -((longlong) li);
2486   }
2487   return (longlong) li;
2488 
2489 no_conv:
2490   /* There was no number to convert.  */
2491   *error= MY_ERRNO_EDOM;
2492   *endptr= (char *) nptr;
2493   return 0;
2494 }
2495 
2496 
2497 static size_t
my_numchars_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e)2498 my_numchars_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2499                   const char *b, const char *e)
2500 {
2501   return (size_t) (e - b) / 4;
2502 }
2503 
2504 
2505 static size_t
my_charpos_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t pos)2506 my_charpos_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2507                  const char *b, const char *e, size_t pos)
2508 {
2509   size_t string_length= (size_t) (e - b);
2510   return pos * 4 > string_length ? string_length + 4 : pos * 4;
2511 }
2512 
2513 
2514 static size_t
my_well_formed_len_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t nchars,int * error)2515 my_well_formed_len_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2516                          const char *b, const char *e,
2517                          size_t nchars, int *error)
2518 {
2519   /* Ensure string length is divisible by 4 */
2520   const char *b0= b;
2521   size_t length= e - b;
2522   DBUG_ASSERT((length % 4) == 0);
2523   *error= 0;
2524   nchars*= 4;
2525   if (length > nchars)
2526   {
2527     length= nchars;
2528     e= b + nchars;
2529   }
2530   for (; b < e; b+= 4)
2531   {
2532     /* Don't accept characters greater than U+10FFFF */
2533     if (b[0] || (uchar) b[1] > 0x10)
2534     {
2535       *error= 1;
2536       return b - b0;
2537     }
2538   }
2539   return length;
2540 }
2541 
2542 
2543 static
my_fill_utf32(const CHARSET_INFO * cs,char * s,size_t slen,int fill)2544 void my_fill_utf32(const CHARSET_INFO *cs,
2545                    char *s, size_t slen, int fill)
2546 {
2547   char buf[10];
2548   char *e= s + slen;
2549 
2550   DBUG_ASSERT((slen % 4) == 0);
2551   {
2552 #ifndef DBUG_OFF
2553     uint buflen=
2554 #endif
2555       cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
2556                       (uchar*) buf + sizeof(buf));
2557     DBUG_ASSERT(buflen == 4);
2558   }
2559   while (s < e)
2560   {
2561     memcpy(s, buf, 4);
2562     s+= 4;
2563   }
2564 }
2565 
2566 
2567 static size_t
my_lengthsp_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)2568 my_lengthsp_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2569                   const char *ptr, size_t length)
2570 {
2571   const char *end= ptr + length;
2572   DBUG_ASSERT((length % 4) == 0);
2573   while (end > ptr + 3 && end[-1] == ' ' && !end[-2] && !end[-3] && !end[-4])
2574     end-= 4;
2575   return (size_t) (end - ptr);
2576 }
2577 
2578 
2579 static int
my_wildcmp_utf32_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2580 my_wildcmp_utf32_ci(const CHARSET_INFO *cs,
2581                     const char *str, const char *str_end,
2582                     const char *wildstr, const char *wildend,
2583                     int escape, int w_one, int w_many)
2584 {
2585   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2586   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2587                             escape, w_one, w_many, uni_plane);
2588 }
2589 
2590 
2591 static int
my_wildcmp_utf32_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2592 my_wildcmp_utf32_bin(const CHARSET_INFO *cs,
2593                      const char *str,const char *str_end,
2594                      const char *wildstr,const char *wildend,
2595                      int escape, int w_one, int w_many)
2596 {
2597   return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2598                             escape, w_one, w_many, NULL);
2599 }
2600 
2601 
2602 static int
my_strnncoll_utf32_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)2603 my_strnncoll_utf32_bin(const CHARSET_INFO *cs,
2604                        const uchar *s, size_t slen,
2605                        const uchar *t, size_t tlen,
2606                        my_bool t_is_prefix)
2607 {
2608   my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
2609   const uchar *se= s + slen;
2610   const uchar *te= t + tlen;
2611 
2612   while (s < se && t < te)
2613   {
2614     int s_res= my_utf32_uni(cs, &s_wc, s, se);
2615     int t_res= my_utf32_uni(cs, &t_wc, t, te);
2616 
2617     if (s_res <= 0 || t_res <= 0)
2618     {
2619       /* Incorrect string, compare by char value */
2620       return my_bincmp(s, se, t, te);
2621     }
2622     if (s_wc != t_wc)
2623     {
2624       return  s_wc > t_wc ? 1 : -1;
2625     }
2626 
2627     s+= s_res;
2628     t+= t_res;
2629   }
2630   return (int) (t_is_prefix ? (t-te) : ((se - s) - (te - t)));
2631 }
2632 
2633 
2634 static inline my_wc_t
my_utf32_get(const uchar * s)2635 my_utf32_get(const uchar *s)
2636 {
2637   return
2638     ((my_wc_t) s[0] << 24) +
2639     ((my_wc_t) s[1] << 16) +
2640     ((my_wc_t) s[2] << 8) +
2641     s[3];
2642 }
2643 
2644 
2645 static int
my_strnncollsp_utf32_bin(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference MY_ATTRIBUTE ((unused)))2646 my_strnncollsp_utf32_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2647                          const uchar *s, size_t slen,
2648                          const uchar *t, size_t tlen,
2649                          my_bool diff_if_only_endspace_difference
2650                          MY_ATTRIBUTE((unused)))
2651 {
2652   const uchar *se, *te;
2653   size_t minlen;
2654 
2655   DBUG_ASSERT((slen % 4) == 0);
2656   DBUG_ASSERT((tlen % 4) == 0);
2657 
2658   se= s + slen;
2659   te= t + tlen;
2660 
2661   for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 4)
2662   {
2663     my_wc_t s_wc= my_utf32_get(s);
2664     my_wc_t t_wc= my_utf32_get(t);
2665     if (s_wc != t_wc)
2666       return  s_wc > t_wc ? 1 : -1;
2667 
2668     s+= 4;
2669     t+= 4;
2670   }
2671 
2672   if (slen != tlen)
2673   {
2674     int swap= 1;
2675     if (slen < tlen)
2676     {
2677       s= t;
2678       se= te;
2679       swap= -1;
2680     }
2681 
2682     for ( ; s < se ; s+= 4)
2683     {
2684       my_wc_t s_wc= my_utf32_get(s);
2685       if (s_wc != ' ')
2686         return (s_wc < ' ') ? -swap : swap;
2687     }
2688   }
2689   return 0;
2690 }
2691 
2692 
2693 static size_t
my_scan_utf32(const CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)2694 my_scan_utf32(const CHARSET_INFO *cs,
2695               const char *str, const char *end, int sequence_type)
2696 {
2697   const char *str0= str;
2698 
2699   switch (sequence_type)
2700   {
2701   case MY_SEQ_SPACES:
2702     for ( ; str < end; )
2703     {
2704       my_wc_t wc;
2705       int res= my_utf32_uni(cs, &wc, (uchar*) str, (uchar*) end);
2706       if (res < 0 || wc != ' ')
2707         break;
2708       str+= res;
2709     }
2710     return (size_t) (str - str0);
2711   default:
2712     return 0;
2713   }
2714 }
2715 
2716 
/*
  Collation handler referenced by my_charset_utf32_general_ci.  It combines
  the utf32 routines that apply cs->caseinfo (my_strnncoll_utf32,
  my_strnncollsp_utf32, my_wildcmp_utf32_ci) with shared helpers.
  The initializer is positional: entries must stay in the order of the
  MY_COLLATION_HANDLER declaration (not visible in this part of the file).
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32,
  my_strnncollsp_utf32,
  my_strnxfrm_unicode,
  my_strnxfrmlen_utf32,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2731 
2732 
/*
  Collation handler referenced by my_charset_utf32_bin.  It uses the
  code-point comparing routines (my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_bin, my_wildcmp_utf32_bin) together with shared
  helpers.  The initializer is positional: entries must stay in the order
  of the MY_COLLATION_HANDLER declaration (not visible in this part of the
  file).
*/
static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2747 
2748 
/*
  Character set handler shared by both utf32 collations: it is referenced
  from my_charset_utf32_general_ci and my_charset_utf32_bin.  Not static,
  so it has external linkage.  The initializer is positional: entries must
  stay in the order of the MY_CHARSET_HANDLER declaration (not visible in
  this part of the file).
*/
MY_CHARSET_HANDLER my_charset_utf32_handler=
{
  NULL, /* init */
  my_ismbchar_utf32,
  my_mbcharlen_utf32,
  my_numchars_utf32,
  my_charpos_utf32,
  my_well_formed_len_utf32,
  my_lengthsp_utf32,
  my_numcells_mb,
  my_utf32_uni,
  my_uni_utf32,
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf32,
  my_casedn_utf32,
  my_snprintf_utf32,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_utf32,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_utf32,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_utf32
};
2779 
2780 
/*
  Definition of the "utf32_general_ci" collation (number 60), flagged
  MY_CS_PRIMARY for character set "utf32".  Characters are a fixed 4 bytes
  (mbminlen == mbmaxlen == 4), the pad character is ' ', and
  my_unicase_default supplies the case information.  The initializer is
  positional; the trailing comments name each field.
*/
CHARSET_INFO my_charset_utf32_general_ci=
{
  60,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf32",             /* cs name    */
  "utf32_general_ci",  /* name         */
  "UTF-32 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  4,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_compare */
  1,                   /* levels_for_order   */
  &my_charset_utf32_handler,
  &my_collation_utf32_general_ci_handler
};
2813 
2814 
/*
  Definition of the "utf32_bin" collation (number 61) of character set
  "utf32", flagged MY_CS_BINSORT.  Characters are a fixed 4 bytes
  (mbminlen == mbmaxlen == 4) and the pad character is ' '.  It shares
  my_charset_utf32_handler with utf32_general_ci but uses
  my_collation_utf32_bin_handler for comparisons.  The initializer is
  positional; the trailing comments name each field.
*/
CHARSET_INFO my_charset_utf32_bin=
{
  61,0,0,              /* number       */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf32",             /* cs name    */
  "utf32_bin",         /* name         */
  "UTF-32 Unicode",    /* comment      */
  NULL,                /* tailoring    */
  NULL,                /* ctype        */
  NULL,                /* to_lower     */
  NULL,                /* to_upper     */
  NULL,                /* sort_order   */
  NULL,                /* uca          */
  NULL,                /* tab_to_uni   */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo     */
  NULL,                /* state_map    */
  NULL,                /* ident_map    */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply  */
  1,                   /* casedn_multiply  */
  4,                   /* mbminlen     */
  4,                   /* mbmaxlen     */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char      */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_compare */
  1,                   /* levels_for_order   */
  &my_charset_utf32_handler,
  &my_collation_utf32_bin_handler
};
2847 
2848 
2849 #endif /* HAVE_CHARSET_utf32 */
2850 
2851 
2852 #ifdef HAVE_CHARSET_ucs2
2853 
/*
  Character class table: 257 entries, a leading 0 followed by one flag
  byte for each of the codes 0x00..0xFF.  Only codes 0x00..0x7F carry
  non-zero flags.
  NOTE(review): the meaning of the individual flag bits is defined
  outside this part of the file -- presumably by the ctype macros in
  m_ctype.h; confirm there.
*/
static uchar ctype_ucs2[] = {
    0,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
   72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
   16,129,129,129,129,129,129,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 16, 16, 16, 16, 16,
   16,130,130,130,130,130,130,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 16, 16, 16, 16, 32,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
2873 
/*
  Single-byte lower-case map, indexed by byte value 0..255.  It is the
  identity mapping except for 'A'..'Z' (65..90), which map to
  'a'..'z' (97..122).
*/
static uchar to_lower_ucs2[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
   96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
2892 
/*
  Single-byte upper-case map, indexed by byte value 0..255.  It is the
  identity mapping except for 'a'..'z' (97..122), which map to
  'A'..'Z' (65..90).
*/
static uchar to_upper_ucs2[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
   96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
2911 
2912 
/*
  Decode one big-endian 2-byte character from [s, e) into *pwc.

  Returns 2 on success, or MY_CS_TOOSMALL2 (leaving *pwc untouched) when
  fewer than two bytes are available.
*/
static int my_ucs2_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
                       my_wc_t * pwc, const uchar *s, const uchar *e)
{
  if (e - s < 2)                 /* Need 2 bytes */
    return MY_CS_TOOSMALL2;

  *pwc= ((my_wc_t) s[0] << 8) | (my_wc_t) s[1];
  return 2;
}
2922 
/*
  Encode code point 'wc' as two big-endian bytes into [r, e).

  Returns 2 on success, MY_CS_TOOSMALL2 when fewer than two bytes of room
  are available, or MY_CS_ILUNI when 'wc' is above 0xFFFF.  The room check
  is made first.
*/
static int my_uni_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
                       my_wc_t wc, uchar *r, uchar *e)
{
  if (e - r < 2)
    return MY_CS_TOOSMALL2;

  /* UCS2 does not support characters outside BMP */
  if (wc > 0xFFFF)
    return MY_CS_ILUNI;

  r[1]= (uchar) (wc & 0xFF);
  r[0]= (uchar) (wc >> 8);
  return 2;
}
2936 
2937 
2938 static inline void
my_tolower_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2939 my_tolower_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2940 {
2941   MY_UNICASE_CHARACTER *page;
2942   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2943     *wc= page[*wc & 0xFF].tolower;
2944 }
2945 
2946 
2947 static inline void
my_toupper_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2948 my_toupper_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2949 {
2950   MY_UNICASE_CHARACTER *page;
2951   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2952     *wc= page[*wc & 0xFF].toupper;
2953 }
2954 
2955 
2956 static inline void
my_tosort_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2957 my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2958 {
2959   MY_UNICASE_CHARACTER *page;
2960   if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2961     *wc= page[*wc & 0xFF].sort;
2962 }
2963 
2964 
/**
  Convert a ucs2 string to upper case in place.

  Each character is decoded, mapped with my_toupper_ucs2() and written
  back over itself.  Conversion stops early if a character cannot be
  decoded or re-encoded in the same number of bytes.

  @param cs      Character set; cs->caseinfo supplies the case mapping.
  @param src     String to convert (modified).
  @param srclen  Length of 'src' in bytes.
  @param dst     Asserted to be equal to 'src'.
  @param dstlen  Asserted to be equal to 'srclen'.

  @return srclen, always.
*/
static size_t my_caseup_ucs2(const CHARSET_INFO *cs, char *src, size_t srclen,
                           char *dst MY_ATTRIBUTE((unused)),
                           size_t dstlen MY_ATTRIBUTE((unused)))
{
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  uchar *pos= (uchar*) src;
  uchar *src_end= (uchar*) src + srclen;
  DBUG_ASSERT(src == dst && srclen == dstlen);

  while (pos < src_end)
  {
    my_wc_t wc;
    int nbytes= my_ucs2_uni(cs, &wc, pos, src_end);
    if (nbytes <= 0)
      break;

    my_toupper_ucs2(uni_plane, &wc);
    if (my_uni_ucs2(cs, wc, pos, src_end) != nbytes)
      break;
    pos+= nbytes;
  }
  return srclen;
}
2985 
2986 
/**
  Fold a ucs2 string into the running hash state (*n1, *n2).

  Trailing U+0020 characters are ignored.  Every remaining character is
  mapped with my_tosort_ucs2() and mixed in as two bytes: the low byte of
  the weight first, then the remaining high part.

  @param cs      Character set; cs->caseinfo supplies the sort mapping.
  @param s       String to hash.
  @param slen    Length of 's' in bytes.
  @param n1      Hash accumulator, updated in place.
  @param n2      Second accumulator, advanced by 3 per mixed-in byte.
*/
static void my_hash_sort_ucs2(const CHARSET_INFO *cs, const uchar *s,
                              size_t slen, ulong *n1, ulong *n2)
{
  const uchar *end= s + slen;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;

  /* Strip trailing spaces so that they do not affect the hash */
  while (end - s >= 2 && end[-2] == '\0' && end[-1] == ' ')
    end-= 2;

  while (s < end)
  {
    my_wc_t wc;
    int nbytes= my_ucs2_uni(cs, &wc, (uchar *) s, (uchar *) end);
    if (nbytes <= 0)
      break;

    my_tosort_ucs2(uni_plane, &wc);

    *n1^= (((*n1 & 63) + *n2) * (wc & 0xFF)) + (*n1 << 8);
    *n2+= 3;
    *n1^= (((*n1 & 63) + *n2) * (wc >> 8)) + (*n1 << 8);
    *n2+= 3;

    s+= nbytes;
  }
}
3008 
3009 
/**
  Convert a ucs2 string to lower case in place.

  Each character is decoded, mapped with my_tolower_ucs2() and written
  back over itself.  Conversion stops early if a character cannot be
  decoded or re-encoded in the same number of bytes.

  @param cs      Character set; cs->caseinfo supplies the case mapping.
  @param src     String to convert (modified).
  @param srclen  Length of 'src' in bytes.
  @param dst     Asserted to be equal to 'src'.
  @param dstlen  Asserted to be equal to 'srclen'.

  @return srclen, always.
*/
static size_t my_casedn_ucs2(const CHARSET_INFO *cs, char *src, size_t srclen,
                           char *dst MY_ATTRIBUTE((unused)),
                           size_t dstlen MY_ATTRIBUTE((unused)))
{
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  uchar *pos= (uchar*) src;
  uchar *src_end= (uchar*) src + srclen;
  DBUG_ASSERT(src == dst && srclen == dstlen);

  while (pos < src_end)
  {
    my_wc_t wc;
    int nbytes= my_ucs2_uni(cs, &wc, pos, src_end);
    if (nbytes <= 0)
      break;

    my_tolower_ucs2(uni_plane, &wc);
    if (my_uni_ucs2(cs, wc, pos, src_end) != nbytes)
      break;
    pos+= nbytes;
  }
  return srclen;
}
3030 
3031 
3032 static void
my_fill_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s,size_t l,int fill)3033 my_fill_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3034              char *s, size_t l, int fill)
3035 {
3036   DBUG_ASSERT(fill <= 0xFFFF);
3037   for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
3038 }
3039 
3040 
/**
  Compare two ucs2 strings character by character, after mapping every
  character through the sort weights of cs->caseinfo.

  @param cs           Character set; cs->caseinfo supplies the sort mapping.
  @param s            First string.
  @param slen         Length of 's' in bytes.
  @param t            Second string.
  @param tlen         Length of 't' in bytes.
  @param t_is_prefix  If set and no differing character is found, the result
                      is (t - te): zero once all of 't' has been consumed,
                      negative otherwise.

  @return Negative, zero or positive number.  If a character cannot be
          decoded, the difference of the current first bytes is returned.
*/
static int my_strnncoll_ucs2(const CHARSET_INFO *cs,
                             const uchar *s, size_t slen,
                             const uchar *t, size_t tlen,
                             my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;

  while (s < s_end && t < t_end)
  {
    int s_len= my_ucs2_uni(cs, &s_wc, s, s_end);
    int t_len= my_ucs2_uni(cs, &t_wc, t, t_end);

    /* Incorrect string, compare by char value */
    if (s_len <= 0 || t_len <= 0)
      return ((int) s[0] - (int) t[0]);

    my_tosort_ucs2(uni_plane, &s_wc);
    my_tosort_ucs2(uni_plane, &t_wc);

    if (s_wc > t_wc)
      return 1;
    if (s_wc < t_wc)
      return -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
3076 
3077 /*
3078   Compare strings, discarding end space
3079 
3080   SYNOPSIS
3081     my_strnncollsp_ucs2()
3082     cs                  character set handler
3083     a                   First string to compare
3084     a_length            Length of 'a'
3085     b                   Second string to compare
3086     b_length            Length of 'b'
3087 
3088   IMPLEMENTATION
    If one string is shorter than the other, the shorter string is treated
    as if it were padded with spaces so that the strings have equal length.
3091 
3092     This will ensure that the following things hold:
3093 
3094     "a"  == "a "
3095     "a\0" < "a"
3096     "a\0" < "a "
3097 
3098   RETURN
3099     < 0  a <  b
3100     = 0  a == b
3101     > 0  a > b
3102 */
3103 
my_strnncollsp_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference MY_ATTRIBUTE ((unused)))3104 static int my_strnncollsp_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3105                                const uchar *s, size_t slen,
3106                                const uchar *t, size_t tlen,
3107                                my_bool diff_if_only_endspace_difference
3108 			       MY_ATTRIBUTE((unused)))
3109 {
3110   const uchar *se, *te;
3111   size_t minlen;
3112   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3113 
3114   /* extra safety to make sure the lengths are even numbers */
3115   slen&= ~1;
3116   tlen&= ~1;
3117 
3118   se= s + slen;
3119   te= t + tlen;
3120 
3121   for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2)
3122   {
3123     int s_wc = uni_plane->page[s[0]] ? (int) uni_plane->page[s[0]][s[1]].sort :
3124                                        (((int) s[0]) << 8) + (int) s[1];
3125 
3126     int t_wc = uni_plane->page[t[0]] ? (int) uni_plane->page[t[0]][t[1]].sort :
3127                                        (((int) t[0]) << 8) + (int) t[1];
3128     if ( s_wc != t_wc )
3129       return  s_wc > t_wc ? 1 : -1;
3130 
3131     s+= 2;
3132     t+= 2;
3133   }
3134 
3135   if (slen != tlen)
3136   {
3137     int swap= 1;
3138     if (slen < tlen)
3139     {
3140       s= t;
3141       se= te;
3142       swap= -1;
3143     }
3144 
3145     for ( ; s < se ; s+= 2)
3146     {
3147       if (s[0] || s[1] != ' ')
3148         return (s[0] == 0 && s[1] < ' ') ? -swap : swap;
3149     }
3150   }
3151   return 0;
3152 }
3153 
3154 
my_ismbchar_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)))3155 static uint my_ismbchar_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3156                              const char *b MY_ATTRIBUTE((unused)),
3157                              const char *e MY_ATTRIBUTE((unused)))
3158 {
3159   return 2;
3160 }
3161 
3162 
my_mbcharlen_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))3163 static uint my_mbcharlen_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
3164                               uint c MY_ATTRIBUTE((unused)))
3165 {
3166   return 2;
3167 }
3168 
3169 
3170 static
my_numchars_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e)3171 size_t my_numchars_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3172                         const char *b, const char *e)
3173 {
3174   return (size_t) (e-b)/2;
3175 }
3176 
3177 
3178 static
my_charpos_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)),size_t pos)3179 size_t my_charpos_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3180                        const char *b  MY_ATTRIBUTE((unused)),
3181                        const char *e  MY_ATTRIBUTE((unused)),
3182                        size_t pos)
3183 {
3184   size_t string_length= (size_t) (e - b);
3185   return pos > string_length ? string_length + 2 : pos * 2;
3186 }
3187 
3188 
3189 static
my_well_formed_len_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t nchars,int * error)3190 size_t my_well_formed_len_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3191                                const char *b, const char *e,
3192                                size_t nchars, int *error)
3193 {
3194   /* Ensure string length is dividable with 2 */
3195   size_t nbytes= ((size_t) (e-b)) & ~(size_t) 1;
3196   *error= 0;
3197   nchars*= 2;
3198   return MY_MIN(nbytes, nchars);
3199 }
3200 
3201 
3202 static
my_wildcmp_ucs2_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3203 int my_wildcmp_ucs2_ci(const CHARSET_INFO *cs,
3204 		    const char *str,const char *str_end,
3205 		    const char *wildstr,const char *wildend,
3206 		    int escape, int w_one, int w_many)
3207 {
3208   MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3209   return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3210                             escape,w_one,w_many,uni_plane);
3211 }
3212 
3213 
3214 static
my_wildcmp_ucs2_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3215 int my_wildcmp_ucs2_bin(const CHARSET_INFO *cs,
3216 		    const char *str,const char *str_end,
3217 		    const char *wildstr,const char *wildend,
3218 		    int escape, int w_one, int w_many)
3219 {
3220   return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3221                             escape,w_one,w_many,NULL);
3222 }
3223 
3224 
3225 static
my_strnncoll_ucs2_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)3226 int my_strnncoll_ucs2_bin(const CHARSET_INFO *cs,
3227                           const uchar *s, size_t slen,
3228                           const uchar *t, size_t tlen,
3229                           my_bool t_is_prefix)
3230 {
3231   int s_res,t_res;
3232   my_wc_t UNINIT_VAR(s_wc),UNINIT_VAR(t_wc);
3233   const uchar *se=s+slen;
3234   const uchar *te=t+tlen;
3235 
3236   while ( s < se && t < te )
3237   {
3238     s_res=my_ucs2_uni(cs,&s_wc, s, se);
3239     t_res=my_ucs2_uni(cs,&t_wc, t, te);
3240 
3241     if ( s_res <= 0 || t_res <= 0 )
3242     {
3243       /* Incorrect string, compare by char value */
3244       return ((int)s[0]-(int)t[0]);
3245     }
3246     if ( s_wc != t_wc )
3247     {
3248       return  s_wc > t_wc ? 1 : -1;
3249     }
3250 
3251     s+=s_res;
3252     t+=t_res;
3253   }
3254   return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
3255 }
3256 
/*
  Compare two UCS2 strings by code unit value, ignoring trailing spaces.

  The common part is compared as big-endian 16-bit values.  If the
  lengths differ, the tail of the longer string is scanned: the first
  non-space character decides the result (smaller than space makes the
  longer string the smaller one, anything else makes it the greater).

  RETURN
    < 0, 0 or > 0.  'diff_if_only_endspace_difference' is ignored.
*/
static int my_strnncollsp_ucs2_bin(const CHARSET_INFO *cs
                                   MY_ATTRIBUTE((unused)),
                                   const uchar *s, size_t slen,
                                   const uchar *t, size_t tlen,
                                   my_bool diff_if_only_endspace_difference
                                   MY_ATTRIBUTE((unused)))
{
  const uchar *tail, *tail_end;
  size_t common;
  int sign;

  /* extra safety to make sure the lengths are even numbers */
  slen-= slen & 1;
  tlen-= tlen & 1;

  common= MY_MIN(slen, tlen);
  while (common)
  {
    int s_wc= (s[0] << 8) | s[1];
    int t_wc= (t[0] << 8) | t[1];
    if (s_wc != t_wc)
      return s_wc > t_wc ? 1 : -1;

    s+= 2;
    t+= 2;
    common-= 2;
  }

  if (slen == tlen)
    return 0;

  /* Pick the unconsumed tail of the longer string */
  if (slen < tlen)
  {
    tail= t;
    tail_end= t + (tlen - slen);
    sign= -1;
  }
  else
  {
    tail= s;
    tail_end= s + (slen - tlen);
    sign= 1;
  }

  while (tail < tail_end)
  {
    if (tail[0] != 0 || tail[1] != ' ')
      return (tail[0] == 0 && tail[1] < ' ') ? -sign : sign;
    tail+= 2;
  }
  return 0;
}
3303 
3304 
3305 static
my_hash_sort_ucs2_bin(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * key,size_t len,ulong * nr1,ulong * nr2)3306 void my_hash_sort_ucs2_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3307 			   const uchar *key, size_t len,ulong *nr1, ulong *nr2)
3308 {
3309   const uchar *pos = key;
3310 
3311   key+= len;
3312 
3313   while (key > pos+1 && key[-1] == ' ' && key[-2] == '\0')
3314     key-= 2;
3315 
3316   for (; pos < (uchar*) key ; pos++)
3317   {
3318     nr1[0]^=(ulong) ((((uint) nr1[0] & 63)+nr2[0]) *
3319 	     ((uint)*pos)) + (nr1[0] << 8);
3320     nr2[0]+=3;
3321   }
3322 }
3323 
3324 
/*
  Collation handler table referenced by the ucs2 general_ci charsets
  defined later in this file.

  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration, which is not part of this file
  section.  NOTE(review): the per-slot labels below are inferred from
  the names of the assigned functions -- confirm against the struct
  declaration.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
{
    NULL,		/* init */
    my_strnncoll_ucs2,          /* compare using the sort mapping */
    my_strnncollsp_ucs2,        /* compare, trailing spaces ignored */
    my_strnxfrm_unicode,
    my_strnxfrmlen_simple,
    my_like_range_generic,
    my_wildcmp_ucs2_ci,         /* wildcard match with the case table */
    my_strcasecmp_mb2_or_mb4,
    my_instr_mb,
    my_hash_sort_ucs2,
    my_propagate_simple
};
3339 
3340 
/*
  Collation handler table referenced by my_charset_ucs2_bin.

  Same layout as the general_ci table, but the compare, wildcard and
  hash slots hold the *_bin functions, which work on raw code values.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration, which is not part of this file
  section.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
{
    NULL,		/* init */
    my_strnncoll_ucs2_bin,      /* compare by code point value */
    my_strnncollsp_ucs2_bin,    /* compare, trailing spaces ignored */
    my_strnxfrm_unicode,
    my_strnxfrmlen_simple,
    my_like_range_generic,
    my_wildcmp_ucs2_bin,        /* wildcard match without a case table */
    my_strcasecmp_mb2_or_mb4,
    my_instr_mb,
    my_hash_sort_ucs2_bin,      /* hash over raw bytes, end spaces dropped */
    my_propagate_simple
};
3355 
3356 
/*
  Character set handler table shared by all three ucs2 CHARSET_INFO
  definitions below.

  The initializer is positional; the slot order is fixed by the
  MY_CHARSET_HANDLER declaration, which is not part of this file
  section.  Functions with a _ucs2 suffix are specific to this charset;
  the _mb2 and _mb2_or_mb4 ones are helpers whose names suggest they are
  shared with other fixed-width Unicode charsets (NOTE(review): inferred
  from naming, confirm against their definitions).
*/
MY_CHARSET_HANDLER my_charset_ucs2_handler=
{
    NULL,		/* init */
    my_ismbchar_ucs2,	/* ismbchar     */
    my_mbcharlen_ucs2,	/* mbcharlen    */
    my_numchars_ucs2,
    my_charpos_ucs2,
    my_well_formed_len_ucs2,
    my_lengthsp_mb2,
    my_numcells_mb,
    my_ucs2_uni,	/* mb_wc        */
    my_uni_ucs2,	/* wc_mb        */
    my_mb_ctype_mb,
    my_caseup_str_mb2_or_mb4,
    my_casedn_str_mb2_or_mb4,
    my_caseup_ucs2,
    my_casedn_ucs2,
    my_snprintf_mb2,
    my_l10tostr_mb2_or_mb4,
    my_ll10tostr_mb2_or_mb4,
    my_fill_ucs2,
    my_strntol_mb2_or_mb4,
    my_strntoul_mb2_or_mb4,
    my_strntoll_mb2_or_mb4,
    my_strntoull_mb2_or_mb4,
    my_strntod_mb2_or_mb4,
    my_strtoll10_mb2,
    my_strntoull10rnd_mb2_or_mb4,
    my_scan_mb2
};
3387 
3388 
/*
  ucs2_general_ci: compiled-in collation number 35 for character set
  "ucs2", flagged MY_CS_PRIMARY.

  Uses my_unicase_default as case info, a fixed width of two bytes per
  character (mbminlen == mbmaxlen == 2), space as pad character, and the
  general_ci collation handler.
*/
CHARSET_INFO my_charset_ucs2_general_ci=
{
    35,0,0,		/* number       */
    MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
    "ucs2",		/* cs name    */
    "ucs2_general_ci",	/* name         */
    "",			/* comment      */
    NULL,		/* tailoring    */
    ctype_ucs2,		/* ctype        */
    to_lower_ucs2,	/* to_lower     */
    to_upper_ucs2,	/* to_upper     */
    to_upper_ucs2,	/* sort_order   */
    NULL,		/* uca          */
    NULL,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
    1,			/* strxfrm_multiply */
    1,                  /* caseup_multiply  */
    1,                  /* casedn_multiply  */
    2,			/* mbminlen     */
    2,			/* mbmaxlen     */
    0,			/* min_sort_char */
    0xFFFF,		/* max_sort_char */
    ' ',                /* pad char      */
    0,                  /* escape_with_backslash_is_dangerous */
    1,                  /* levels_for_compare */
    1,                  /* levels_for_order   */
    &my_charset_ucs2_handler,
    &my_collation_ucs2_general_ci_handler
};
3421 
3422 
/*
  ucs2_general_mysql500_ci: compiled-in collation number 159 for
  character set "ucs2".

  Identical to ucs2_general_ci in every field except the number, the
  name, the absence of MY_CS_PRIMARY, and the case info table, which is
  my_unicase_mysql500 instead of my_unicase_default.
*/
CHARSET_INFO my_charset_ucs2_general_mysql500_ci=
{
  159, 0, 0,                                       /* number           */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
  "ucs2",                                          /* cs name          */
  "ucs2_general_mysql500_ci",                      /* name             */
  "",                                              /* comment          */
  NULL,                                            /* tailoring        */
  ctype_ucs2,                                      /* ctype            */
  to_lower_ucs2,                                   /* to_lower         */
  to_upper_ucs2,                                   /* to_upper         */
  to_upper_ucs2,                                   /* sort_order       */
  NULL,                                            /* uca              */
  NULL,                                            /* tab_to_uni       */
  NULL,                                            /* tab_from_uni     */
  &my_unicase_mysql500,                            /* caseinfo         */
  NULL,                                            /* state_map        */
  NULL,                                            /* ident_map        */
  1,                                               /* strxfrm_multiply */
  1,                                               /* caseup_multiply  */
  1,                                               /* casedn_multiply  */
  2,                                               /* mbminlen         */
  2,                                               /* mbmaxlen         */
  0,                                               /* min_sort_char    */
  0xFFFF,                                          /* max_sort_char    */
  ' ',                                             /* pad char         */
  0,                          /* escape_with_backslash_is_dangerous    */
  1,                                               /* levels_for_compare */
  1,                                               /* levels_for_order   */
  &my_charset_ucs2_handler,
  &my_collation_ucs2_general_ci_handler
};
3455 
3456 
/*
  ucs2_bin: compiled-in binary collation number 90 for character set
  "ucs2", flagged MY_CS_BINSORT.

  Unlike the general_ci definitions it has no sort_order table (NULL)
  and points at the binary collation handler; the character set handler
  and the fixed two-byte width are the same.
*/
CHARSET_INFO my_charset_ucs2_bin=
{
    90,0,0,		/* number       */
    MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
    "ucs2",		/* cs name    */
    "ucs2_bin",		/* name         */
    "",			/* comment      */
    NULL,		/* tailoring    */
    ctype_ucs2,		/* ctype        */
    to_lower_ucs2,	/* to_lower     */
    to_upper_ucs2,	/* to_upper     */
    NULL,		/* sort_order   */
    NULL,		/* uca          */
    NULL,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
    1,			/* strxfrm_multiply */
    1,                  /* caseup_multiply  */
    1,                  /* casedn_multiply  */
    2,			/* mbminlen     */
    2,			/* mbmaxlen     */
    0,			/* min_sort_char */
    0xFFFF,		/* max_sort_char */
    ' ',                /* pad char      */
    0,                  /* escape_with_backslash_is_dangerous */
    1,                  /* levels_for_compare */
    1,                  /* levels_for_order   */
    &my_charset_ucs2_handler,
    &my_collation_ucs2_bin_handler
};
3489 
3490 
3491 #endif /* HAVE_CHARSET_ucs2 */
3492