1 /* Copyright (c) 2003, 2021, Oracle and/or its affiliates.
2
3 This library is free software; you can redistribute it and/or
4 modify it under the terms of the GNU Library General Public
5 License as published by the Free Software Foundation; version 2
6 of the License.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Library General Public License for more details.
12
13 You should have received a copy of the GNU Library General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
16
17 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
18
19 #include <my_global.h>
20 #include <my_sys.h>
21 #include "m_string.h"
22 #include "m_ctype.h"
23 #include <errno.h>
24 #include <stdarg.h>
25
26
/* HAVE_CHARSET_mb2 is set when utf16 or ucs2 support is compiled in. */
#if defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf16)
#define HAVE_CHARSET_mb2
#endif


/* HAVE_CHARSET_mb2_or_mb4 is set when any of utf16, ucs2, utf32 is in. */
#if defined(HAVE_CHARSET_utf32) || defined(HAVE_CHARSET_mb2)
#define HAVE_CHARSET_mb2_or_mb4
#endif


/* Substitute ENOENT when the platform does not provide EILSEQ. */
#if !defined(EILSEQ)
#define EILSEQ ENOENT
#endif

/* Limits and decimal scale factors used by the string-to-number code. */
#define ULONGLONG_MAX           (~(ulonglong) 0)
#define MAX_NEGATIVE_NUMBER     ((ulonglong) 0x8000000000000000LL)
#define INIT_CNT                9               /* characters per chunk */
#define LFACTOR                 1000000000ULL   /* 10^9  */
#define LFACTOR1                10000000000ULL  /* 10^10 */
#define LFACTOR2                100000000000ULL /* 10^11 */
47
48 #ifdef HAVE_CHARSET_mb2_or_mb4
/*
  Powers of ten, lfactor[n] == 10^n for n in 0..8.
  Read-only lookup table, hence const.
*/
static const unsigned long lfactor[9]=
{
  1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L
};
51
52 static inline int
my_bincmp(const uchar * s,const uchar * se,const uchar * t,const uchar * te)53 my_bincmp(const uchar *s, const uchar *se,
54 const uchar *t, const uchar *te)
55 {
56 int slen= (int) (se - s), tlen= (int) (te - t);
57 int len= MY_MIN(slen, tlen);
58 int cmp= memcmp(s, t, len);
59 return cmp ? cmp : slen - tlen;
60 }
61
62
63 static size_t
my_caseup_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s MY_ATTRIBUTE ((unused)))64 my_caseup_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE((unused)),
65 char * s MY_ATTRIBUTE((unused)))
66 {
67 assert(0);
68 return 0;
69 }
70
71
72 static size_t
my_casedn_str_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s MY_ATTRIBUTE ((unused)))73 my_casedn_str_mb2_or_mb4(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
74 char * s MY_ATTRIBUTE((unused)))
75 {
76 assert(0);
77 return 0;
78 }
79
80
81 static int
my_strcasecmp_mb2_or_mb4(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * s MY_ATTRIBUTE ((unused)),const char * t MY_ATTRIBUTE ((unused)))82 my_strcasecmp_mb2_or_mb4(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
83 const char *s MY_ATTRIBUTE((unused)),
84 const char *t MY_ATTRIBUTE((unused)))
85 {
86 assert(0);
87 return 0;
88 }
89
90
91 static long
my_strntol_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)92 my_strntol_mb2_or_mb4(const CHARSET_INFO *cs,
93 const char *nptr, size_t l, int base,
94 char **endptr, int *err)
95 {
96 int negative= 0;
97 int overflow;
98 int cnv;
99 my_wc_t wc;
100 unsigned int cutlim;
101 uint32 cutoff;
102 uint32 res;
103 const uchar *s= (const uchar*) nptr;
104 const uchar *e= (const uchar*) nptr+l;
105 const uchar *save;
106
107 *err= 0;
108 do
109 {
110 if ((cnv= cs->cset->mb_wc(cs, &wc, s, e))>0)
111 {
112 switch (wc)
113 {
114 case ' ' : break;
115 case '\t': break;
116 case '-' : negative= !negative; break;
117 case '+' : break;
118 default : goto bs;
119 }
120 }
121 else /* No more characters or bad multibyte sequence */
122 {
123 if (endptr != NULL )
124 *endptr= (char*) s;
125 err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
126 return 0;
127 }
128 s+= cnv;
129 } while (1);
130
131 bs:
132
133 overflow= 0;
134 res= 0;
135 save= s;
136 cutoff= ((uint32)~0L) / (uint32) base;
137 cutlim= (uint) (((uint32)~0L) % (uint32) base);
138
139 do {
140 if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
141 {
142 s+= cnv;
143 if (wc >= '0' && wc <= '9')
144 wc-= '0';
145 else if (wc >= 'A' && wc <= 'Z')
146 wc= wc - 'A' + 10;
147 else if (wc >= 'a' && wc <= 'z')
148 wc= wc - 'a' + 10;
149 else
150 break;
151 if ((int)wc >= base)
152 break;
153 if (res > cutoff || (res == cutoff && wc > cutlim))
154 overflow= 1;
155 else
156 {
157 res*= (uint32) base;
158 res+= wc;
159 }
160 }
161 else if (cnv == MY_CS_ILSEQ)
162 {
163 if (endptr !=NULL )
164 *endptr = (char*) s;
165 err[0]= EILSEQ;
166 return 0;
167 }
168 else
169 {
170 /* No more characters */
171 break;
172 }
173 } while(1);
174
175 if (endptr != NULL)
176 *endptr = (char *) s;
177
178 if (s == save)
179 {
180 err[0]= EDOM;
181 return 0L;
182 }
183
184 if (negative)
185 {
186 if (res > (uint32) INT_MIN32)
187 overflow= 1;
188 }
189 else if (res > INT_MAX32)
190 overflow= 1;
191
192 if (overflow)
193 {
194 err[0]= ERANGE;
195 return negative ? INT_MIN32 : INT_MAX32;
196 }
197
198 return (negative ? -((long) res) : (long) res);
199 }
200
201
202 static ulong
my_strntoul_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)203 my_strntoul_mb2_or_mb4(const CHARSET_INFO *cs,
204 const char *nptr, size_t l, int base,
205 char **endptr, int *err)
206 {
207 int negative= 0;
208 int overflow;
209 int cnv;
210 my_wc_t wc;
211 unsigned int cutlim;
212 uint32 cutoff;
213 uint32 res;
214 const uchar *s= (const uchar*) nptr;
215 const uchar *e= (const uchar*) nptr + l;
216 const uchar *save;
217
218 *err= 0;
219 do
220 {
221 if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
222 {
223 switch (wc)
224 {
225 case ' ' : break;
226 case '\t': break;
227 case '-' : negative= !negative; break;
228 case '+' : break;
229 default : goto bs;
230 }
231 }
232 else /* No more characters or bad multibyte sequence */
233 {
234 if (endptr !=NULL )
235 *endptr= (char*)s;
236 err[0]= (cnv == MY_CS_ILSEQ) ? EILSEQ : EDOM;
237 return 0;
238 }
239 s+= cnv;
240 } while (1);
241
242 bs:
243
244 overflow= 0;
245 res= 0;
246 save= s;
247 cutoff= ((uint32)~0L) / (uint32) base;
248 cutlim= (uint) (((uint32)~0L) % (uint32) base);
249
250 do
251 {
252 if ((cnv= cs->cset->mb_wc(cs, &wc, s, e)) > 0)
253 {
254 s+= cnv;
255 if (wc >= '0' && wc <= '9')
256 wc-= '0';
257 else if (wc >= 'A' && wc <= 'Z')
258 wc= wc - 'A' + 10;
259 else if (wc >= 'a' && wc <= 'z')
260 wc= wc - 'a' + 10;
261 else
262 break;
263 if ((int) wc >= base)
264 break;
265 if (res > cutoff || (res == cutoff && wc > cutlim))
266 overflow = 1;
267 else
268 {
269 res*= (uint32) base;
270 res+= wc;
271 }
272 }
273 else if (cnv == MY_CS_ILSEQ)
274 {
275 if (endptr != NULL )
276 *endptr= (char*)s;
277 err[0]= EILSEQ;
278 return 0;
279 }
280 else
281 {
282 /* No more characters */
283 break;
284 }
285 } while(1);
286
287 if (endptr != NULL)
288 *endptr= (char *) s;
289
290 if (s == save)
291 {
292 err[0]= EDOM;
293 return 0L;
294 }
295
296 if (overflow)
297 {
298 err[0]= (ERANGE);
299 return (~(uint32) 0);
300 }
301
302 return (negative ? -((long) res) : (long) res);
303 }
304
305
306 static longlong
my_strntoll_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)307 my_strntoll_mb2_or_mb4(const CHARSET_INFO *cs,
308 const char *nptr, size_t l, int base,
309 char **endptr, int *err)
310 {
311 int negative=0;
312 int overflow;
313 int cnv;
314 my_wc_t wc;
315 ulonglong cutoff;
316 unsigned int cutlim;
317 ulonglong res;
318 const uchar *s= (const uchar*) nptr;
319 const uchar *e= (const uchar*) nptr+l;
320 const uchar *save;
321
322 *err= 0;
323 do
324 {
325 if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
326 {
327 switch (wc)
328 {
329 case ' ' : break;
330 case '\t': break;
331 case '-' : negative= !negative; break;
332 case '+' : break;
333 default : goto bs;
334 }
335 }
336 else /* No more characters or bad multibyte sequence */
337 {
338 if (endptr !=NULL )
339 *endptr = (char*)s;
340 err[0] = (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
341 return 0;
342 }
343 s+=cnv;
344 } while (1);
345
346 bs:
347
348 overflow = 0;
349 res = 0;
350 save = s;
351 cutoff = (~(ulonglong) 0) / (unsigned long int) base;
352 cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
353
354 do {
355 if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
356 {
357 s+=cnv;
358 if ( wc>='0' && wc<='9')
359 wc -= '0';
360 else if ( wc>='A' && wc<='Z')
361 wc = wc - 'A' + 10;
362 else if ( wc>='a' && wc<='z')
363 wc = wc - 'a' + 10;
364 else
365 break;
366 if ((int)wc >= base)
367 break;
368 if (res > cutoff || (res == cutoff && wc > cutlim))
369 overflow = 1;
370 else
371 {
372 res *= (ulonglong) base;
373 res += wc;
374 }
375 }
376 else if (cnv==MY_CS_ILSEQ)
377 {
378 if (endptr !=NULL )
379 *endptr = (char*)s;
380 err[0]=EILSEQ;
381 return 0;
382 }
383 else
384 {
385 /* No more characters */
386 break;
387 }
388 } while(1);
389
390 if (endptr != NULL)
391 *endptr = (char *) s;
392
393 if (s == save)
394 {
395 err[0]=EDOM;
396 return 0L;
397 }
398
399 if (negative)
400 {
401 if (res > (ulonglong) LLONG_MIN)
402 overflow = 1;
403 }
404 else if (res > (ulonglong) LLONG_MAX)
405 overflow = 1;
406
407 if (overflow)
408 {
409 err[0]=ERANGE;
410 return negative ? LLONG_MIN : LLONG_MAX;
411 }
412
413 return (negative ? -((longlong)res) : (longlong)res);
414 }
415
416
417 static ulonglong
my_strntoull_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)418 my_strntoull_mb2_or_mb4(const CHARSET_INFO *cs,
419 const char *nptr, size_t l, int base,
420 char **endptr, int *err)
421 {
422 int negative= 0;
423 int overflow;
424 int cnv;
425 my_wc_t wc;
426 ulonglong cutoff;
427 unsigned int cutlim;
428 ulonglong res;
429 const uchar *s= (const uchar*) nptr;
430 const uchar *e= (const uchar*) nptr + l;
431 const uchar *save;
432
433 *err= 0;
434 do
435 {
436 if ((cnv= cs->cset->mb_wc(cs,&wc,s,e)) > 0)
437 {
438 switch (wc)
439 {
440 case ' ' : break;
441 case '\t': break;
442 case '-' : negative= !negative; break;
443 case '+' : break;
444 default : goto bs;
445 }
446 }
447 else /* No more characters or bad multibyte sequence */
448 {
449 if (endptr !=NULL )
450 *endptr = (char*)s;
451 err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
452 return 0;
453 }
454 s+=cnv;
455 } while (1);
456
457 bs:
458
459 overflow = 0;
460 res = 0;
461 save = s;
462 cutoff = (~(ulonglong) 0) / (unsigned long int) base;
463 cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
464
465 do
466 {
467 if ((cnv=cs->cset->mb_wc(cs,&wc,s,e))>0)
468 {
469 s+=cnv;
470 if ( wc>='0' && wc<='9')
471 wc -= '0';
472 else if ( wc>='A' && wc<='Z')
473 wc = wc - 'A' + 10;
474 else if ( wc>='a' && wc<='z')
475 wc = wc - 'a' + 10;
476 else
477 break;
478 if ((int)wc >= base)
479 break;
480 if (res > cutoff || (res == cutoff && wc > cutlim))
481 overflow = 1;
482 else
483 {
484 res *= (ulonglong) base;
485 res += wc;
486 }
487 }
488 else if (cnv==MY_CS_ILSEQ)
489 {
490 if (endptr !=NULL )
491 *endptr = (char*)s;
492 err[0]= EILSEQ;
493 return 0;
494 }
495 else
496 {
497 /* No more characters */
498 break;
499 }
500 } while(1);
501
502 if (endptr != NULL)
503 *endptr = (char *) s;
504
505 if (s == save)
506 {
507 err[0]= EDOM;
508 return 0L;
509 }
510
511 if (overflow)
512 {
513 err[0]= ERANGE;
514 return (~(ulonglong) 0);
515 }
516
517 return (negative ? -((longlong) res) : (longlong) res);
518 }
519
520
521 static double
my_strntod_mb2_or_mb4(const CHARSET_INFO * cs,char * nptr,size_t length,char ** endptr,int * err)522 my_strntod_mb2_or_mb4(const CHARSET_INFO *cs,
523 char *nptr, size_t length,
524 char **endptr, int *err)
525 {
526 char buf[256];
527 double res;
528 char *b= buf;
529 const uchar *s= (const uchar*) nptr;
530 const uchar *end;
531 my_wc_t wc;
532 int cnv;
533
534 *err= 0;
535 /* Cut too long strings */
536 if (length >= sizeof(buf))
537 length= sizeof(buf) - 1;
538 end= s + length;
539
540 while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
541 {
542 s+= cnv;
543 if (wc > (int) (uchar) 'e' || !wc)
544 break; /* Can't be part of double */
545 *b++= (char) wc;
546 }
547
548 *endptr= b;
549 res= my_strtod(buf, endptr, err);
550 *endptr= nptr + cs->mbminlen * (size_t) (*endptr - buf);
551 return res;
552 }
553
554
555 static ulonglong
my_strntoull10rnd_mb2_or_mb4(const CHARSET_INFO * cs,const char * nptr,size_t length,int unsign_fl,char ** endptr,int * err)556 my_strntoull10rnd_mb2_or_mb4(const CHARSET_INFO *cs,
557 const char *nptr, size_t length,
558 int unsign_fl,
559 char **endptr, int *err)
560 {
561 char buf[256], *b= buf;
562 ulonglong res;
563 const uchar *end, *s= (const uchar*) nptr;
564 my_wc_t wc;
565 int cnv;
566
567 /* Cut too long strings */
568 if (length >= sizeof(buf))
569 length= sizeof(buf)-1;
570 end= s + length;
571
572 while ((cnv= cs->cset->mb_wc(cs,&wc,s,end)) > 0)
573 {
574 s+= cnv;
575 if (wc > (int) (uchar) 'e' || !wc)
576 break; /* Can't be a number part */
577 *b++= (char) wc;
578 }
579
580 res= my_strntoull10rnd_8bit(cs, buf, b - buf, unsign_fl, endptr, err);
581 *endptr= (char*) nptr + cs->mbminlen * (size_t) (*endptr - buf);
582 return res;
583 }
584
585
586 /*
587 This is a fast version optimized for the case of radix 10 / -10
588 */
589
590 static size_t
my_l10tostr_mb2_or_mb4(const CHARSET_INFO * cs,char * dst,size_t len,int radix,long int val)591 my_l10tostr_mb2_or_mb4(const CHARSET_INFO *cs,
592 char *dst, size_t len, int radix, long int val)
593 {
594 char buffer[66];
595 char *p, *db, *de;
596 long int new_val;
597 int sl= 0;
598 unsigned long int uval = (unsigned long int) val;
599
600 p= &buffer[sizeof(buffer) - 1];
601 *p= '\0';
602
603 if (radix < 0)
604 {
605 if (val < 0)
606 {
607 sl= 1;
608 /* Avoid integer overflow in (-val) for LLONG_MIN (BUG#31799). */
609 uval = (unsigned long int)0 - uval;
610 }
611 }
612
613 new_val = (long) (uval / 10);
614 *--p = '0'+ (char) (uval - (unsigned long) new_val * 10);
615 val= new_val;
616
617 while (val != 0)
618 {
619 new_val= val / 10;
620 *--p= '0' + (char) (val - new_val * 10);
621 val= new_val;
622 }
623
624 if (sl)
625 {
626 *--p= '-';
627 }
628
629 for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
630 {
631 int cnvres= cs->cset->wc_mb(cs,(my_wc_t)p[0],(uchar*) dst, (uchar*) de);
632 if (cnvres > 0)
633 dst+= cnvres;
634 else
635 break;
636 }
637 return (int) (dst - db);
638 }
639
640
641 static size_t
my_ll10tostr_mb2_or_mb4(const CHARSET_INFO * cs,char * dst,size_t len,int radix,longlong val)642 my_ll10tostr_mb2_or_mb4(const CHARSET_INFO *cs,
643 char *dst, size_t len, int radix, longlong val)
644 {
645 char buffer[65];
646 char *p, *db, *de;
647 long long_val;
648 int sl= 0;
649 ulonglong uval= (ulonglong) val;
650
651 if (radix < 0)
652 {
653 if (val < 0)
654 {
655 sl= 1;
656 /* Avoid integer overflow in (-val) for LLONG_MIN (BUG#31799). */
657 uval = (ulonglong)0 - uval;
658 }
659 }
660
661 p= &buffer[sizeof(buffer)-1];
662 *p='\0';
663
664 if (uval == 0)
665 {
666 *--p= '0';
667 goto cnv;
668 }
669
670 while (uval > (ulonglong) LONG_MAX)
671 {
672 ulonglong quo= uval/(uint) 10;
673 uint rem= (uint) (uval- quo* (uint) 10);
674 *--p= '0' + rem;
675 uval= quo;
676 }
677
678 long_val= (long) uval;
679 while (long_val != 0)
680 {
681 long quo= long_val/10;
682 *--p= (char) ('0' + (long_val - quo*10));
683 long_val= quo;
684 }
685
686 cnv:
687 if (sl)
688 {
689 *--p= '-';
690 }
691
692 for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
693 {
694 int cnvres= cs->cset->wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
695 if (cnvres > 0)
696 dst+= cnvres;
697 else
698 break;
699 }
700 return (int) (dst -db);
701 }
702
703 #endif /* HAVE_CHARSET_mb2_or_mb4 */
704
705
706 #ifdef HAVE_CHARSET_mb2
707 static longlong
my_strtoll10_mb2(const CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)708 my_strtoll10_mb2(const CHARSET_INFO *cs,
709 const char *nptr, char **endptr, int *error)
710 {
711 const char *s, *end, *start, *n_end, *true_end;
712 uchar c;
713 unsigned long i, j, k;
714 ulonglong li;
715 int negative;
716 ulong cutoff, cutoff2, cutoff3;
717 my_wc_t wc;
718 int res;
719
720 s= nptr;
721 /* If fixed length string */
722 if (endptr)
723 {
724 /*
725 Make sure string length is even.
726 Odd length indicates a bug in the caller.
727 Assert in debug, round in production.
728 */
729 assert((*endptr - s) % 2 == 0);
730 end= s + ((*endptr - s) / 2) * 2;
731
732 for ( ; ; ) /* Skip leading spaces and tabs */
733 {
734 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
735 if (res <= 0)
736 goto no_conv;
737 s+= res;
738 if (wc != ' ' && wc != '\t')
739 break;
740 }
741 }
742 else
743 {
744 /* We don't support null terminated strings in UCS2 */
745 goto no_conv;
746 }
747
748 /* Check for a sign. */
749 negative= 0;
750 if (wc == '-')
751 {
752 *error= -1; /* Mark as negative number */
753 negative= 1;
754 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
755 if (res <= 0)
756 goto no_conv;
757 s+= res;
758 cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2;
759 cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
760 cutoff3= MAX_NEGATIVE_NUMBER % 100;
761 }
762 else
763 {
764 *error= 0;
765 if (wc == '+')
766 {
767 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
768 if (res <= 0)
769 goto no_conv;
770 s+= res;
771 }
772 cutoff= ULONGLONG_MAX / LFACTOR2;
773 cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
774 cutoff3= ULONGLONG_MAX % 100;
775 }
776
777
778 /* Handle case where we have a lot of pre-zero */
779 if (wc == '0')
780 {
781 i= 0;
782 for ( ; ; s+= res)
783 {
784 if (s == end)
785 goto end_i; /* Return 0 */
786 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
787 if (res <= 0)
788 goto no_conv;
789 if (wc != '0')
790 break;
791 }
792 while (wc == '0');
793 n_end= s + 2 * INIT_CNT;
794 }
795 else
796 {
797 /* Read first digit to check that it's a valid number */
798 if ((c= (wc - '0')) > 9)
799 goto no_conv;
800 i= c;
801 n_end= s + 2 * (INIT_CNT-1);
802 }
803
804 /* Handle first 9 digits and store them in i */
805 if (n_end > end)
806 n_end= end;
807 for ( ; ; )
808 {
809 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) n_end);
810 if (res <= 0)
811 break;
812 s+= res;
813 if ((c= (wc - '0')) > 9)
814 goto end_i;
815 i= i*10+c;
816 }
817 if (s == end)
818 goto end_i;
819
820 /* Handle next 9 digits and store them in j */
821 j= 0;
822 start= s; /* Used to know how much to shift i */
823 n_end= true_end= s + 2 * INIT_CNT;
824 if (n_end > end)
825 n_end= end;
826 do
827 {
828 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
829 if (res <= 0)
830 goto no_conv;
831 s+= res;
832 if ((c= (wc - '0')) > 9)
833 goto end_i_and_j;
834 j= j*10+c;
835 } while (s != n_end);
836 if (s == end)
837 {
838 if (s != true_end)
839 goto end_i_and_j;
840 goto end3;
841 }
842 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
843 if (res <= 0)
844 goto no_conv;
845 s+= res;
846 if ((c= (wc - '0')) > 9)
847 goto end3;
848
849 /* Handle the next 1 or 2 digits and store them in k */
850 k=c;
851 if (s == end)
852 goto end4;
853 res= cs->cset->mb_wc(cs, &wc, (const uchar *) s, (const uchar *) end);
854 if (res <= 0)
855 goto no_conv;
856 s+= res;
857 if ((c= (wc - '0')) > 9)
858 goto end4;
859 k= k*10+c;
860 *endptr= (char*) s;
861
862 /* number string should have ended here */
863 if (s != end && (c= (wc - '0')) <= 9)
864 goto overflow;
865
866 /* Check that we didn't get an overflow with the last digit */
867 if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
868 k > cutoff3)))
869 goto overflow;
870 li=i*LFACTOR2+ (ulonglong) j*100 + k;
871 return (longlong) li;
872
873 overflow: /* *endptr is set here */
874 *error= MY_ERRNO_ERANGE;
875 return negative ? LLONG_MIN : (longlong) ULONGLONG_MAX;
876
877 end_i:
878 *endptr= (char*) s;
879 return (negative ? ((longlong) -(long) i) : (longlong) i);
880
881 end_i_and_j:
882 li= (ulonglong) i * lfactor[(size_t) (s-start) / 2] + j;
883 *endptr= (char*) s;
884 return (negative ? -((longlong) li) : (longlong) li);
885
886 end3:
887 li=(ulonglong) i*LFACTOR+ (ulonglong) j;
888 *endptr= (char*) s;
889 return (negative ? -((longlong) li) : (longlong) li);
890
891 end4:
892 li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
893 *endptr= (char*) s;
894 if (negative)
895 {
896 if (li > MAX_NEGATIVE_NUMBER)
897 goto overflow;
898 return -((longlong) li);
899 }
900 return (longlong) li;
901
902 no_conv:
903 /* There was no number to convert. */
904 *error= MY_ERRNO_EDOM;
905 *endptr= (char *) nptr;
906 return 0;
907 }
908
909
910 static size_t
my_scan_mb2(const CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)911 my_scan_mb2(const CHARSET_INFO *cs,
912 const char *str, const char *end, int sequence_type)
913 {
914 const char *str0= str;
915 my_wc_t wc;
916 int res;
917
918 switch (sequence_type)
919 {
920 case MY_SEQ_SPACES:
921 for (res= cs->cset->mb_wc(cs, &wc,
922 (const uchar *) str, (const uchar *) end);
923 res > 0 && wc == ' ';
924 str+= res,
925 res= cs->cset->mb_wc(cs, &wc,
926 (const uchar *) str, (const uchar *) end))
927 {
928 }
929 return (size_t) (str - str0);
930 default:
931 return 0;
932 }
933 }
934
935
936 static void
my_fill_mb2(const CHARSET_INFO * cs,char * s,size_t slen,int fill)937 my_fill_mb2(const CHARSET_INFO *cs, char *s, size_t slen, int fill)
938 {
939 char buf[10];
940 int buflen;
941
942 assert((slen % 2) == 0);
943
944 buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
945 (uchar*) buf + sizeof(buf));
946
947 assert(buflen > 0);
948
949 while (slen >= (size_t) buflen)
950 {
951 /* Enough space for the characer */
952 memcpy(s, buf, (size_t) buflen);
953 s+= buflen;
954 slen-= buflen;
955 }
956
957 /*
958 If there are some more space which is not enough
959 for the whole multibyte character, then add trailing zeros.
960 */
961 for ( ; slen; slen--)
962 {
963 *s++= 0x00;
964 }
965 }
966
967
968 static size_t
my_vsnprintf_mb2(char * dst,size_t n,const char * fmt,va_list ap)969 my_vsnprintf_mb2(char *dst, size_t n, const char* fmt, va_list ap)
970 {
971 char *start=dst, *end= dst + n - 1;
972 for (; *fmt ; fmt++)
973 {
974 if (fmt[0] != '%')
975 {
976 if (dst == end) /* End of buffer */
977 break;
978
979 *dst++='\0';
980 *dst++= *fmt; /* Copy ordinary char */
981 continue;
982 }
983
984 fmt++;
985
986 /* Skip if max size is used (to be compatible with printf) */
987 while ( (*fmt >= '0' && *fmt <= '9') || *fmt == '.' || *fmt == '-')
988 fmt++;
989
990 if (*fmt == 'l')
991 fmt++;
992
993 if (*fmt == 's') /* String parameter */
994 {
995 char *par= va_arg(ap, char *);
996 size_t plen;
997 size_t left_len= (size_t)(end-dst);
998 if (!par)
999 par= (char*) "(null)";
1000 plen= strlen(par);
1001 if (left_len <= plen * 2)
1002 plen = left_len / 2 - 1;
1003
1004 for ( ; plen ; plen--, dst+=2, par++)
1005 {
1006 dst[0]= '\0';
1007 dst[1]= par[0];
1008 }
1009 continue;
1010 }
1011 else if (*fmt == 'd' || *fmt == 'u') /* Integer parameter */
1012 {
1013 int iarg;
1014 char nbuf[16];
1015 char *pbuf= nbuf;
1016
1017 if ((size_t) (end - dst) < 32)
1018 break;
1019 iarg= va_arg(ap, int);
1020 if (*fmt == 'd')
1021 int10_to_str((long) iarg, nbuf, -10);
1022 else
1023 int10_to_str((long) (uint) iarg, nbuf,10);
1024
1025 for (; pbuf[0]; pbuf++)
1026 {
1027 *dst++= '\0';
1028 *dst++= *pbuf;
1029 }
1030 continue;
1031 }
1032
1033 /* We come here on '%%', unknown code or too long parameter */
1034 if (dst == end)
1035 break;
1036 *dst++= '\0';
1037 *dst++= '%'; /* % used as % or unknown code */
1038 }
1039
1040 assert(dst <= end);
1041 *dst='\0'; /* End of errmessage */
1042 return (size_t) (dst - start);
1043 }
1044
1045
1046 static size_t
my_snprintf_mb2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * to,size_t n,const char * fmt,...)1047 my_snprintf_mb2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1048 char* to, size_t n, const char* fmt, ...)
1049 {
1050 size_t retval;
1051 va_list args;
1052 va_start(args,fmt);
1053 retval= my_vsnprintf_mb2(to, n, fmt, args);
1054 va_end(args);
1055 return retval;
1056 }
1057
1058
1059 static size_t
my_lengthsp_mb2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)1060 my_lengthsp_mb2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1061 const char *ptr, size_t length)
1062 {
1063 const char *end= ptr + length;
1064 while (end > ptr + 1 && end[-1] == ' ' && end[-2] == '\0')
1065 end-= 2;
1066 return (size_t) (end - ptr);
1067 }
1068
1069 #endif /* HAVE_CHARSET_mb2*/
1070
1071
1072
1073
1074 #ifdef HAVE_CHARSET_utf16
1075
/*
  D800..DB7F - Non-private surrogate high (896 pages)
  DB80..DBFF - Private surrogate high     (128 pages)
  DC00..DFFF - Surrogate low              (1024 codes in a page)
*/
#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800
#define MY_UTF16_SURROGATE_HIGH_LAST  0xDBFF
#define MY_UTF16_SURROGATE_LOW_FIRST  0xDC00
#define MY_UTF16_SURROGATE_LOW_LAST   0xDFFF

/* Tests on the first byte of a two-byte unit (HEAD) or on a code point. */
#define MY_UTF16_HIGH_HEAD(x)  ((((uchar) (x)) & 0xFC) == 0xD8)
#define MY_UTF16_LOW_HEAD(x)   ((((uchar) (x)) & 0xFC) == 0xDC)
#define MY_UTF16_SURROGATE(x)  (((x) & 0xF800) == 0xD800)

/*
  Combine the two bytes of a single unit into a code point.
  Arguments are parenthesized so that any expression may be passed.
*/
#define MY_UTF16_WC2(a, b)       (((a) << 8) + (b))

/*
  Combine the four bytes of a surrogate pair into a code point.

  a= 110110??  (<< 18)
  b= ????????  (<< 10)
  c= 110111??  (<<  8)
  d= ????????  (<<  0)
*/
#define MY_UTF16_WC4(a, b, c, d) ((((a) & 3) << 18) + ((b) << 10) + \
                                  (((c) & 3) << 8) + (d) + 0x10000)
1100
1101 static int
my_utf16_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1102 my_utf16_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1103 my_wc_t *pwc, const uchar *s, const uchar *e)
1104 {
1105 if (s + 2 > e)
1106 return MY_CS_TOOSMALL2;
1107
1108 /*
1109 High bytes: 0xD[89AB] = B'110110??'
1110 Low bytes: 0xD[CDEF] = B'110111??'
1111 Surrogate mask: 0xFC = B'11111100'
1112 */
1113
1114 if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */
1115 {
1116 if (s + 4 > e)
1117 return MY_CS_TOOSMALL4;
1118
1119 if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */
1120 return MY_CS_ILSEQ;
1121
1122 *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]);
1123 return 4;
1124 }
1125
1126 if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */
1127 return MY_CS_ILSEQ;
1128
1129 *pwc= MY_UTF16_WC2(s[0], s[1]);
1130 return 2;
1131 }
1132
1133
1134 static int
my_uni_utf16(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1135 my_uni_utf16(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1136 my_wc_t wc, uchar *s, uchar *e)
1137 {
1138 if (wc <= 0xFFFF)
1139 {
1140 if (s + 2 > e)
1141 return MY_CS_TOOSMALL2;
1142 if (MY_UTF16_SURROGATE(wc))
1143 return MY_CS_ILUNI;
1144 *s++= (uchar) (wc >> 8);
1145 *s= (uchar) (wc & 0xFF);
1146 return 2;
1147 }
1148
1149 if (wc <= 0x10FFFF)
1150 {
1151 if (s + 4 > e)
1152 return MY_CS_TOOSMALL4;
1153 *s++= (uchar) ((wc-= 0x10000) >> 18) | 0xD8;
1154 *s++= (uchar) (wc >> 10) & 0xFF;
1155 *s++= (uchar) ((wc >> 8) & 3) | 0xDC;
1156 *s= (uchar) wc & 0xFF;
1157 return 4;
1158 }
1159
1160 return MY_CS_ILUNI;
1161 }
1162
1163
1164 static inline void
my_tolower_utf16(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1165 my_tolower_utf16(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1166 {
1167 const MY_UNICASE_CHARACTER *page;
1168 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1169 *wc= page[*wc & 0xFF].tolower;
1170 }
1171
1172
1173 static inline void
my_toupper_utf16(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1174 my_toupper_utf16(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1175 {
1176 const MY_UNICASE_CHARACTER *page;
1177 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1178 *wc= page[*wc & 0xFF].toupper;
1179 }
1180
1181
1182 static inline void
my_tosort_utf16(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1183 my_tosort_utf16(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1184 {
1185 if (*wc <= uni_plane->maxchar)
1186 {
1187 const MY_UNICASE_CHARACTER *page;
1188 if ((page= uni_plane->page[*wc >> 8]))
1189 *wc= page[*wc & 0xFF].sort;
1190 }
1191 else
1192 {
1193 *wc= MY_CS_REPLACEMENT_CHARACTER;
1194 }
1195 }
1196
1197
1198
1199 static size_t
my_caseup_utf16(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1200 my_caseup_utf16(const CHARSET_INFO *cs, char *src, size_t srclen,
1201 char *dst MY_ATTRIBUTE((unused)),
1202 size_t dstlen MY_ATTRIBUTE((unused)))
1203 {
1204 my_wc_t wc;
1205 int res;
1206 char *srcend= src + srclen;
1207 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1208 assert(src == dst && srclen == dstlen);
1209
1210 while ((src < srcend) &&
1211 (res= cs->cset->mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1212 {
1213 my_toupper_utf16(uni_plane, &wc);
1214 if (res != cs->cset->wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
1215 break;
1216 src+= res;
1217 }
1218 return srclen;
1219 }
1220
1221
1222 static void
my_hash_sort_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * n1,ulong * n2)1223 my_hash_sort_utf16(const CHARSET_INFO *cs, const uchar *s, size_t slen,
1224 ulong *n1, ulong *n2)
1225 {
1226 my_wc_t wc;
1227 int res;
1228 const uchar *e= s + cs->cset->lengthsp(cs, (const char *) s, slen);
1229 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1230 ulong tmp1;
1231 ulong tmp2;
1232
1233 tmp1= *n1;
1234 tmp2= *n2;
1235
1236 while ((s < e) && (res= cs->cset->mb_wc(cs, &wc,
1237 (uchar *) s, (uchar *) e)) > 0)
1238 {
1239 my_tosort_utf16(uni_plane, &wc);
1240 tmp1^= (((tmp1 & 63) + tmp2) * (wc & 0xFF)) + (tmp1 << 8);
1241 tmp2+= 3;
1242 tmp1^= (((tmp1 & 63) + tmp2) * (wc >> 8)) + (tmp1 << 8);
1243 tmp2+= 3;
1244 s+= res;
1245 }
1246
1247 *n1= tmp1;
1248 *n2= tmp2;
1249 }
1250
1251
1252 static size_t
my_casedn_utf16(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))1253 my_casedn_utf16(const CHARSET_INFO *cs, char *src, size_t srclen,
1254 char *dst MY_ATTRIBUTE((unused)),
1255 size_t dstlen MY_ATTRIBUTE((unused)))
1256 {
1257 my_wc_t wc;
1258 int res;
1259 char *srcend= src + srclen;
1260 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1261 assert(src == dst && srclen == dstlen);
1262
1263 while ((src < srcend) &&
1264 (res= cs->cset->mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1265 {
1266 my_tolower_utf16(uni_plane, &wc);
1267 if (res != cs->cset->wc_mb(cs, wc, (uchar *) src, (uchar *) srcend))
1268 break;
1269 src+= res;
1270 }
1271 return srclen;
1272 }
1273
1274
1275 static int
my_strnncoll_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)1276 my_strnncoll_utf16(const CHARSET_INFO *cs,
1277 const uchar *s, size_t slen,
1278 const uchar *t, size_t tlen,
1279 my_bool t_is_prefix)
1280 {
1281 int s_res, t_res;
1282 my_wc_t s_wc= 0, t_wc= 0;
1283 const uchar *se= s + slen;
1284 const uchar *te= t + tlen;
1285 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1286
1287 while (s < se && t < te)
1288 {
1289 s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1290 t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1291
1292 if (s_res <= 0 || t_res <= 0)
1293 {
1294 /* Incorrect string, compare by char value */
1295 return my_bincmp(s, se, t, te);
1296 }
1297
1298 my_tosort_utf16(uni_plane, &s_wc);
1299 my_tosort_utf16(uni_plane, &t_wc);
1300
1301 if (s_wc != t_wc)
1302 {
1303 return s_wc > t_wc ? 1 : -1;
1304 }
1305
1306 s+= s_res;
1307 t+= t_res;
1308 }
1309 return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
1310 }
1311
1312
1313 /**
1314 Compare strings, discarding end space
1315
1316 If one string is shorter as the other, then we space extend the other
1317 so that the strings have equal length.
1318
1319 This will ensure that the following things hold:
1320
1321 "a" == "a "
1322 "a\0" < "a"
1323 "a\0" < "a "
1324
1325 @param cs Character set pinter.
1326 @param a First string to compare.
1327 @param a_length Length of 'a'.
1328 @param b Second string to compare.
1329 @param b_length Length of 'b'.
1330
1331 IMPLEMENTATION
1332
1333 @return Comparison result.
1334 @retval Negative number, if a less than b.
1335 @retval 0, if a is equal to b
1336 @retval Positive number, if a > b
1337 */
1338
1339 static int
my_strnncollsp_utf16(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)1340 my_strnncollsp_utf16(const CHARSET_INFO *cs,
1341 const uchar *s, size_t slen,
1342 const uchar *t, size_t tlen,
1343 my_bool diff_if_only_endspace_difference)
1344 {
1345 int res;
1346 my_wc_t s_wc= 0, t_wc= 0;
1347 const uchar *se= s + slen, *te= t + tlen;
1348 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1349
1350 assert((slen % 2) == 0);
1351 assert((tlen % 2) == 0);
1352
1353 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
1354 diff_if_only_endspace_difference= FALSE;
1355 #endif
1356
1357 while (s < se && t < te)
1358 {
1359 int s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1360 int t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1361
1362 if (s_res <= 0 || t_res <= 0)
1363 {
1364 /* Incorrect string, compare bytewise */
1365 return my_bincmp(s, se, t, te);
1366 }
1367
1368 my_tosort_utf16(uni_plane, &s_wc);
1369 my_tosort_utf16(uni_plane, &t_wc);
1370
1371 if (s_wc != t_wc)
1372 {
1373 return s_wc > t_wc ? 1 : -1;
1374 }
1375
1376 s+= s_res;
1377 t+= t_res;
1378 }
1379
1380 slen= (size_t) (se - s);
1381 tlen= (size_t) (te - t);
1382 res= 0;
1383
1384 if (slen != tlen)
1385 {
1386 int s_res, swap= 1;
1387 if (diff_if_only_endspace_difference)
1388 res= 1; /* Assume 's' is bigger */
1389 if (slen < tlen)
1390 {
1391 slen= tlen;
1392 s= t;
1393 se= te;
1394 swap= -1;
1395 res= -res;
1396 }
1397
1398 for ( ; s < se; s+= s_res)
1399 {
1400 if ((s_res= cs->cset->mb_wc(cs, &s_wc, s, se)) <= 0)
1401 {
1402 return 0;
1403 }
1404 if (s_wc != ' ')
1405 return (s_wc < ' ') ? -swap : swap;
1406 }
1407 }
1408 return res;
1409 }
1410
1411
1412 static uint
my_ismbchar_utf16(const CHARSET_INFO * cs,const char * b,const char * e)1413 my_ismbchar_utf16(const CHARSET_INFO *cs, const char *b, const char *e)
1414 {
1415 my_wc_t wc;
1416 int res= cs->cset->mb_wc(cs, &wc, (const uchar *) b, (const uchar *) e);
1417 return (uint) (res > 0 ? res : 0);
1418 }
1419
1420
1421 static uint
my_mbcharlen_utf16(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))1422 my_mbcharlen_utf16(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1423 uint c MY_ATTRIBUTE((unused)))
1424 {
1425 assert(0);
1426 return MY_UTF16_HIGH_HEAD(c) ? 4 : 2;
1427 }
1428
1429
1430 static size_t
my_numchars_utf16(const CHARSET_INFO * cs,const char * b,const char * e)1431 my_numchars_utf16(const CHARSET_INFO *cs,
1432 const char *b, const char *e)
1433 {
1434 size_t nchars= 0;
1435 for ( ; ; nchars++)
1436 {
1437 size_t charlen= my_ismbchar_utf16(cs, b, e);
1438 if (!charlen)
1439 break;
1440 b+= charlen;
1441 }
1442 return nchars;
1443 }
1444
1445
1446 static size_t
my_charpos_utf16(const CHARSET_INFO * cs,const char * b,const char * e,size_t pos)1447 my_charpos_utf16(const CHARSET_INFO *cs,
1448 const char *b, const char *e, size_t pos)
1449 {
1450 const char *b0= b;
1451 uint charlen;
1452
1453 for ( ; pos; b+= charlen, pos--)
1454 {
1455 if (!(charlen= my_ismbchar(cs, b, e)))
1456 return (e + 2 - b0); /* Error, return pos outside the string */
1457 }
1458 return (size_t) (pos ? (e + 2 - b0) : (b - b0));
1459 }
1460
1461
1462 static size_t
my_well_formed_len_utf16(const CHARSET_INFO * cs,const char * b,const char * e,size_t nchars,int * error)1463 my_well_formed_len_utf16(const CHARSET_INFO *cs,
1464 const char *b, const char *e,
1465 size_t nchars, int *error)
1466 {
1467 const char *b0= b;
1468 uint charlen;
1469 *error= 0;
1470
1471 for ( ; nchars; b+= charlen, nchars--)
1472 {
1473 if (!(charlen= my_ismbchar(cs, b, e)))
1474 {
1475 *error= b < e ? 1 : 0;
1476 break;
1477 }
1478 }
1479 return (size_t) (b - b0);
1480 }
1481
1482
1483 static int
my_wildcmp_utf16_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1484 my_wildcmp_utf16_ci(const CHARSET_INFO *cs,
1485 const char *str,const char *str_end,
1486 const char *wildstr,const char *wildend,
1487 int escape, int w_one, int w_many)
1488 {
1489 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1490 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1491 escape, w_one, w_many, uni_plane);
1492 }
1493
1494
1495 static int
my_wildcmp_utf16_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1496 my_wildcmp_utf16_bin(const CHARSET_INFO *cs,
1497 const char *str,const char *str_end,
1498 const char *wildstr,const char *wildend,
1499 int escape, int w_one, int w_many)
1500 {
1501 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1502 escape, w_one, w_many, NULL);
1503 }
1504
1505
1506 static int
my_strnncoll_utf16_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)1507 my_strnncoll_utf16_bin(const CHARSET_INFO *cs,
1508 const uchar *s, size_t slen,
1509 const uchar *t, size_t tlen,
1510 my_bool t_is_prefix)
1511 {
1512 int s_res,t_res;
1513 my_wc_t s_wc= 0, t_wc= 0;
1514 const uchar *se=s+slen;
1515 const uchar *te=t+tlen;
1516
1517 while ( s < se && t < te )
1518 {
1519 s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1520 t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1521
1522 if (s_res <= 0 || t_res <= 0)
1523 {
1524 /* Incorrect string, compare by char value */
1525 return my_bincmp(s, se, t, te);
1526 }
1527 if (s_wc != t_wc)
1528 {
1529 return s_wc > t_wc ? 1 : -1;
1530 }
1531
1532 s+= s_res;
1533 t+= t_res;
1534 }
1535 return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
1536 }
1537
1538
1539 static int
my_strnncollsp_utf16_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)1540 my_strnncollsp_utf16_bin(const CHARSET_INFO *cs,
1541 const uchar *s, size_t slen,
1542 const uchar *t, size_t tlen,
1543 my_bool diff_if_only_endspace_difference)
1544 {
1545 int res;
1546 my_wc_t s_wc= 0, t_wc= 0;
1547 const uchar *se= s + slen, *te= t + tlen;
1548
1549 assert((slen % 2) == 0);
1550 assert((tlen % 2) == 0);
1551
1552 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
1553 diff_if_only_endspace_difference= FALSE;
1554 #endif
1555
1556 while (s < se && t < te)
1557 {
1558 int s_res= cs->cset->mb_wc(cs, &s_wc, s, se);
1559 int t_res= cs->cset->mb_wc(cs, &t_wc, t, te);
1560
1561 if (s_res <= 0 || t_res <= 0)
1562 {
1563 /* Incorrect string, compare bytewise */
1564 return my_bincmp(s, se, t, te);
1565 }
1566
1567 if (s_wc != t_wc)
1568 {
1569 return s_wc > t_wc ? 1 : -1;
1570 }
1571
1572 s+= s_res;
1573 t+= t_res;
1574 }
1575
1576 slen= (size_t) (se - s);
1577 tlen= (size_t) (te - t);
1578 res= 0;
1579
1580 if (slen != tlen)
1581 {
1582 int s_res, swap= 1;
1583 if (diff_if_only_endspace_difference)
1584 res= 1; /* Assume 's' is bigger */
1585 if (slen < tlen)
1586 {
1587 slen= tlen;
1588 s= t;
1589 se= te;
1590 swap= -1;
1591 res= -res;
1592 }
1593
1594 for ( ; s < se; s+= s_res)
1595 {
1596 if ((s_res= cs->cset->mb_wc(cs, &s_wc, s, se)) <= 0)
1597 {
1598 return 0;
1599 }
1600 if (s_wc != ' ')
1601 return (s_wc < ' ') ? -swap : swap;
1602 }
1603 }
1604 return res;
1605 }
1606
1607
1608 static void
my_hash_sort_utf16_bin(const CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1609 my_hash_sort_utf16_bin(const CHARSET_INFO *cs,
1610 const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
1611 {
1612 const uchar *end= pos + cs->cset->lengthsp(cs, (const char *) pos, len);
1613 ulong tmp1;
1614 ulong tmp2;
1615
1616 tmp1= *nr1;
1617 tmp2= *nr2;
1618
1619 for ( ; pos < end ; pos++)
1620 {
1621 tmp1^= (ulong) ((((uint) tmp1 & 63) + tmp2) *
1622 ((uint)*pos)) + (tmp1 << 8);
1623 tmp2+= 3;
1624 }
1625
1626 *nr1= tmp1;
1627 *nr2= tmp2;
1628 }
1629
1630
/*
  Collation handler referenced by my_charset_utf16_general_ci and
  my_charset_utf16le_general_ci.  The comparison and wildcard entries
  (my_strnncoll_utf16, my_strnncollsp_utf16, my_wildcmp_utf16_ci) use the
  collation's caseinfo; the other entries are defined outside this part
  of the file.
*/
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
{
    NULL,                /* init */
    my_strnncoll_utf16,
    my_strnncollsp_utf16,
    my_strnxfrm_unicode,
    my_strnxfrmlen_simple,
    my_like_range_generic,
    my_wildcmp_utf16_ci,
    my_strcasecmp_mb2_or_mb4,
    my_instr_mb,
    my_hash_sort_utf16,
    my_propagate_simple
};
1645
1646
/*
  Collation handler referenced by my_charset_utf16_bin and
  my_charset_utf16le_bin.  The comparison, wildcard and hash entries are
  the *_utf16_bin functions, which work on character or byte values
  without case mapping.
*/
static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
{
    NULL,                /* init */
    my_strnncoll_utf16_bin,
    my_strnncollsp_utf16_bin,
    my_strnxfrm_unicode_full_bin,
    my_strnxfrmlen_unicode_full_bin,
    my_like_range_generic,
    my_wildcmp_utf16_bin,
    my_strcasecmp_mb2_or_mb4,
    my_instr_mb,
    my_hash_sort_utf16_bin,
    my_propagate_simple
};
1661
1662
/*
  Character set handler referenced by my_charset_utf16_general_ci and
  my_charset_utf16_bin.  Decoding and encoding go through my_utf16_uni
  and my_uni_utf16; the length and position helpers are the *_utf16
  functions, the rest are shared mb2 / mb2_or_mb4 routines.
*/
MY_CHARSET_HANDLER my_charset_utf16_handler=
{
    NULL,                /* init         */
    my_ismbchar_utf16,   /* ismbchar     */
    my_mbcharlen_utf16,  /* mbcharlen    */
    my_numchars_utf16,
    my_charpos_utf16,
    my_well_formed_len_utf16,
    my_lengthsp_mb2,
    my_numcells_mb,
    my_utf16_uni,        /* mb_wc        */
    my_uni_utf16,        /* wc_mb        */
    my_mb_ctype_mb,
    my_caseup_str_mb2_or_mb4,
    my_casedn_str_mb2_or_mb4,
    my_caseup_utf16,
    my_casedn_utf16,
    my_snprintf_mb2,
    my_l10tostr_mb2_or_mb4,
    my_ll10tostr_mb2_or_mb4,
    my_fill_mb2,
    my_strntol_mb2_or_mb4,
    my_strntoul_mb2_or_mb4,
    my_strntoll_mb2_or_mb4,
    my_strntoull_mb2_or_mb4,
    my_strntod_mb2_or_mb4,
    my_strtoll10_mb2,
    my_strntoull10rnd_mb2_or_mb4,
    my_scan_mb2
};
1693
1694
/*
  Definition of the "utf16_general_ci" collation of the "utf16" character
  set (number 54).  It carries the MY_CS_PRIMARY flag, uses
  my_unicase_default as case information, and combines
  my_charset_utf16_handler with my_collation_utf16_general_ci_handler.
*/
CHARSET_INFO my_charset_utf16_general_ci=
{
    54,0,0,              /* number       */
    MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
    "utf16",             /* cs name    */
    "utf16_general_ci",  /* name         */
    "UTF-16 Unicode",    /* comment      */
    NULL,                /* tailoring    */
    NULL,                /* ctype        */
    NULL,                /* to_lower     */
    NULL,                /* to_upper     */
    NULL,                /* sort_order   */
    NULL,                /* uca          */
    NULL,                /* tab_to_uni   */
    NULL,                /* tab_from_uni */
    &my_unicase_default, /* caseinfo     */
    NULL,                /* state_map    */
    NULL,                /* ident_map    */
    1,                   /* strxfrm_multiply */
    1,                   /* caseup_multiply  */
    1,                   /* casedn_multiply  */
    2,                   /* mbminlen     */
    4,                   /* mbmaxlen     */
    1,                   /* mbmaxlenlen  */
    0,                   /* min_sort_char */
    0xFFFF,              /* max_sort_char */
    ' ',                 /* pad char      */
    0,                   /* escape_with_backslash_is_dangerous */
    1,                   /* levels_for_compare */
    1,                   /* levels_for_order   */
    &my_charset_utf16_handler,
    &my_collation_utf16_general_ci_handler
};
1728
1729
/*
  Definition of the "utf16_bin" collation of the "utf16" character set
  (number 55).  It carries the MY_CS_BINSORT flag and combines
  my_charset_utf16_handler with my_collation_utf16_bin_handler.
*/
CHARSET_INFO my_charset_utf16_bin=
{
    55,0,0,              /* number       */
    MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
    "utf16",             /* cs name      */
    "utf16_bin",         /* name         */
    "UTF-16 Unicode",    /* comment      */
    NULL,                /* tailoring    */
    NULL,                /* ctype        */
    NULL,                /* to_lower     */
    NULL,                /* to_upper     */
    NULL,                /* sort_order   */
    NULL,                /* uca          */
    NULL,                /* tab_to_uni   */
    NULL,                /* tab_from_uni */
    &my_unicase_default, /* caseinfo     */
    NULL,                /* state_map    */
    NULL,                /* ident_map    */
    1,                   /* strxfrm_multiply */
    1,                   /* caseup_multiply  */
    1,                   /* casedn_multiply  */
    2,                   /* mbminlen     */
    4,                   /* mbmaxlen     */
    1,                   /* mbmaxlenlen  */
    0,                   /* min_sort_char */
    0xFFFF,              /* max_sort_char */
    ' ',                 /* pad char      */
    0,                   /* escape_with_backslash_is_dangerous */
    1,                   /* levels_for_compare */
    1,                   /* levels_for_order   */
    &my_charset_utf16_handler,
    &my_collation_utf16_bin_handler
};
1763
1764
1765 static int
my_utf16le_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1766 my_utf16le_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1767 my_wc_t *pwc, const uchar *s, const uchar *e)
1768 {
1769 my_wc_t lo;
1770
1771 if (s + 2 > e)
1772 return MY_CS_TOOSMALL2;
1773
1774 if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
1775 (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
1776 return 2; /* [0000-D7FF,E000-FFFF] */
1777
1778 if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
1779 return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
1780
1781 if (s + 4 > e)
1782 return MY_CS_TOOSMALL4;
1783
1784 s+= 2;
1785
1786 if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
1787 lo > MY_UTF16_SURROGATE_LOW_LAST)
1788 return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
1789
1790 *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF));
1791 return 4;
1792 }
1793
1794
1795 static int
my_uni_utf16le(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1796 my_uni_utf16le(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1797 my_wc_t wc, uchar *s, uchar *e)
1798 {
1799 if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
1800 (wc > MY_UTF16_SURROGATE_LOW_LAST &&
1801 wc <= 0xFFFF))
1802 {
1803 if (s + 2 > e)
1804 return MY_CS_TOOSMALL2;
1805 int2store(s, (uint16)wc);
1806 return 2; /* [0000-D7FF,E000-FFFF] */
1807 }
1808
1809 if (wc < 0xFFFF || wc > 0x10FFFF)
1810 return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */
1811
1812 if (s + 4 > e)
1813 return MY_CS_TOOSMALL4;
1814
1815 wc-= 0x10000;
1816 int2store(s, (0xD800 | ((wc >> 10) & 0x3FF))); s+= 2;
1817 int2store(s, (0xDC00 | (wc & 0x3FF)));
1818 return 4; /* [010000-10FFFF] */
1819 }
1820
1821
1822 static size_t
my_lengthsp_utf16le(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)1823 my_lengthsp_utf16le(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1824 const char *ptr, size_t length)
1825 {
1826 const char *end= ptr + length;
1827 while (end > ptr + 1 && uint2korr((uchar*) end - 2) == 0x20)
1828 end-= 2;
1829 return (size_t) (end - ptr);
1830 }
1831
1832
/*
  Character set handler referenced by my_charset_utf16le_general_ci and
  my_charset_utf16le_bin.  Compared with my_charset_utf16_handler it
  plugs in my_lengthsp_utf16le, my_utf16le_uni and my_uni_utf16le; all
  other entries are the same functions.
*/
static MY_CHARSET_HANDLER my_charset_utf16le_handler=
{
    NULL,                /* init         */
    my_ismbchar_utf16,
    my_mbcharlen_utf16,
    my_numchars_utf16,
    my_charpos_utf16,
    my_well_formed_len_utf16,
    my_lengthsp_utf16le,
    my_numcells_mb,
    my_utf16le_uni,      /* mb_wc        */
    my_uni_utf16le,      /* wc_mb        */
    my_mb_ctype_mb,
    my_caseup_str_mb2_or_mb4,
    my_casedn_str_mb2_or_mb4,
    my_caseup_utf16,
    my_casedn_utf16,
    my_snprintf_mb2,
    my_l10tostr_mb2_or_mb4,
    my_ll10tostr_mb2_or_mb4,
    my_fill_mb2,
    my_strntol_mb2_or_mb4,
    my_strntoul_mb2_or_mb4,
    my_strntoll_mb2_or_mb4,
    my_strntoull_mb2_or_mb4,
    my_strntod_mb2_or_mb4,
    my_strtoll10_mb2,
    my_strntoull10rnd_mb2_or_mb4,
    my_scan_mb2
};
1863
1864
/*
  Definition of the "utf16le_general_ci" collation of the "utf16le"
  character set (number 56).  It carries the MY_CS_PRIMARY flag and
  combines my_charset_utf16le_handler with
  my_collation_utf16_general_ci_handler.
*/
CHARSET_INFO my_charset_utf16le_general_ci=
{
    56,0,0,              /* number       */
    MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
    "utf16le",           /* cs name    */
    "utf16le_general_ci",/* name         */
    "UTF-16LE Unicode",  /* comment      */
    NULL,                /* tailoring    */
    NULL,                /* ctype        */
    NULL,                /* to_lower     */
    NULL,                /* to_upper     */
    NULL,                /* sort_order   */
    NULL,                /* uca          */
    NULL,                /* tab_to_uni   */
    NULL,                /* tab_from_uni */
    &my_unicase_default, /* caseinfo     */
    NULL,                /* state_map    */
    NULL,                /* ident_map    */
    1,                   /* strxfrm_multiply */
    1,                   /* caseup_multiply  */
    1,                   /* casedn_multiply  */
    2,                   /* mbminlen     */
    4,                   /* mbmaxlen     */
    1,                   /* mbmaxlenlen  */
    0,                   /* min_sort_char */
    0xFFFF,              /* max_sort_char */
    ' ',                 /* pad char      */
    0,                   /* escape_with_backslash_is_dangerous */
    1,                   /* levels_for_compare */
    1,                   /* levels_for_order   */
    &my_charset_utf16le_handler,
    &my_collation_utf16_general_ci_handler
};
1898
1899
/*
  Definition of the "utf16le_bin" collation of the "utf16le" character
  set (number 62).  It carries the MY_CS_BINSORT flag and combines
  my_charset_utf16le_handler with my_collation_utf16_bin_handler.
*/
CHARSET_INFO my_charset_utf16le_bin=
{
    62,0,0,              /* number       */
    MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
    "utf16le",           /* cs name      */
    "utf16le_bin",       /* name         */
    "UTF-16LE Unicode",  /* comment      */
    NULL,                /* tailoring    */
    NULL,                /* ctype        */
    NULL,                /* to_lower     */
    NULL,                /* to_upper     */
    NULL,                /* sort_order   */
    NULL,                /* uca          */
    NULL,                /* tab_to_uni   */
    NULL,                /* tab_from_uni */
    &my_unicase_default, /* caseinfo     */
    NULL,                /* state_map    */
    NULL,                /* ident_map    */
    1,                   /* strxfrm_multiply */
    1,                   /* caseup_multiply  */
    1,                   /* casedn_multiply  */
    2,                   /* mbminlen     */
    4,                   /* mbmaxlen     */
    1,                   /* mbmaxlenlen  */
    0,                   /* min_sort_char */
    0xFFFF,              /* max_sort_char */
    ' ',                 /* pad char      */
    0,                   /* escape_with_backslash_is_dangerous */
    1,                   /* levels_for_compare */
    1,                   /* levels_for_order   */
    &my_charset_utf16le_handler,
    &my_collation_utf16_bin_handler
};
1933
1934
1935 #endif /* HAVE_CHARSET_utf16 */
1936
1937
1938 #ifdef HAVE_CHARSET_utf32
1939
1940 static int
my_utf32_uni(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t * pwc,const uchar * s,const uchar * e)1941 my_utf32_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1942 my_wc_t *pwc, const uchar *s, const uchar *e)
1943 {
1944 if (s + 4 > e)
1945 return MY_CS_TOOSMALL4;
1946 *pwc= (((my_wc_t)s[0]) << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
1947 return 4;
1948 }
1949
1950
1951 static int
my_uni_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),my_wc_t wc,uchar * s,uchar * e)1952 my_uni_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
1953 my_wc_t wc, uchar *s, uchar *e)
1954 {
1955 if (s + 4 > e)
1956 return MY_CS_TOOSMALL4;
1957
1958 s[0]= (uchar) (wc >> 24);
1959 s[1]= (uchar) (wc >> 16) & 0xFF;
1960 s[2]= (uchar) (wc >> 8) & 0xFF;
1961 s[3]= (uchar) wc & 0xFF;
1962 return 4;
1963 }
1964
1965
1966 static inline void
my_tolower_utf32(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1967 my_tolower_utf32(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1968 {
1969 const MY_UNICASE_CHARACTER *page;
1970 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1971 *wc= page[*wc & 0xFF].tolower;
1972 }
1973
1974
1975 static inline void
my_toupper_utf32(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1976 my_toupper_utf32(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1977 {
1978 const MY_UNICASE_CHARACTER *page;
1979 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1980 *wc= page[*wc & 0xFF].toupper;
1981 }
1982
1983
1984 static inline void
my_tosort_utf32(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1985 my_tosort_utf32(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1986 {
1987 if (*wc <= uni_plane->maxchar)
1988 {
1989 const MY_UNICASE_CHARACTER *page;
1990 if ((page= uni_plane->page[*wc >> 8]))
1991 *wc= page[*wc & 0xFF].sort;
1992 }
1993 else
1994 {
1995 *wc= MY_CS_REPLACEMENT_CHARACTER;
1996 }
1997 }
1998
1999
2000 static size_t
my_caseup_utf32(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))2001 my_caseup_utf32(const CHARSET_INFO *cs, char *src, size_t srclen,
2002 char *dst MY_ATTRIBUTE((unused)),
2003 size_t dstlen MY_ATTRIBUTE((unused)))
2004 {
2005 my_wc_t wc;
2006 int res;
2007 char *srcend= src + srclen;
2008 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2009 assert(src == dst && srclen == dstlen);
2010
2011 while ((src < srcend) &&
2012 (res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
2013 {
2014 my_toupper_utf32(uni_plane, &wc);
2015 if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
2016 break;
2017 src+= res;
2018 }
2019 return srclen;
2020 }
2021
2022
2023 static void
my_hash_sort_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * n1,ulong * n2)2024 my_hash_sort_utf32(const CHARSET_INFO *cs, const uchar *s, size_t slen,
2025 ulong *n1, ulong *n2)
2026 {
2027 my_wc_t wc;
2028 int res;
2029 const uchar *e= s + slen;
2030 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2031 ulong tmp1;
2032 ulong tmp2;
2033 uint ch;
2034
2035 /* Skip trailing spaces */
2036 while (e > s + 3 && e[-1] == ' ' && !e[-2] && !e[-3] && !e[-4])
2037 e-= 4;
2038
2039 tmp1= *n1;
2040 tmp2= *n2;
2041
2042 while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
2043 {
2044 my_tosort_utf32(uni_plane, &wc);
2045
2046 ch= (wc >> 24);
2047 tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2048 tmp2+= 3;
2049
2050 ch= (wc >> 16) & 0xFF;
2051 tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2052 tmp2+= 3;
2053
2054 ch= (wc >> 8) & 0xFF;
2055 tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2056 tmp2+= 3;
2057
2058 ch= (wc & 0xFF);
2059 tmp1^= (((tmp1 & 63) + tmp2) * ch) + (tmp1 << 8);
2060 tmp2+= 3;
2061
2062 s+= res;
2063 }
2064
2065 *n1= tmp1;
2066 *n2= tmp2;
2067 }
2068
2069
2070 static size_t
my_casedn_utf32(const CHARSET_INFO * cs,char * src,size_t srclen,char * dst MY_ATTRIBUTE ((unused)),size_t dstlen MY_ATTRIBUTE ((unused)))2071 my_casedn_utf32(const CHARSET_INFO *cs, char *src, size_t srclen,
2072 char *dst MY_ATTRIBUTE((unused)),
2073 size_t dstlen MY_ATTRIBUTE((unused)))
2074 {
2075 my_wc_t wc;
2076 int res;
2077 char *srcend= src + srclen;
2078 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2079 assert(src == dst && srclen == dstlen);
2080
2081 while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
2082 {
2083 my_tolower_utf32(uni_plane,&wc);
2084 if (res != my_uni_utf32(cs, wc, (uchar*) src, (uchar*) srcend))
2085 break;
2086 src+= res;
2087 }
2088 return srclen;
2089 }
2090
2091
2092 static int
my_strnncoll_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)2093 my_strnncoll_utf32(const CHARSET_INFO *cs,
2094 const uchar *s, size_t slen,
2095 const uchar *t, size_t tlen,
2096 my_bool t_is_prefix)
2097 {
2098 my_wc_t s_wc= 0, t_wc= 0;
2099 const uchar *se= s + slen;
2100 const uchar *te= t + tlen;
2101 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2102
2103 while (s < se && t < te)
2104 {
2105 int s_res= my_utf32_uni(cs, &s_wc, s, se);
2106 int t_res= my_utf32_uni(cs, &t_wc, t, te);
2107
2108 if ( s_res <= 0 || t_res <= 0)
2109 {
2110 /* Incorrect string, compare by char value */
2111 return my_bincmp(s, se, t, te);
2112 }
2113
2114 my_tosort_utf32(uni_plane, &s_wc);
2115 my_tosort_utf32(uni_plane, &t_wc);
2116
2117 if (s_wc != t_wc)
2118 {
2119 return s_wc > t_wc ? 1 : -1;
2120 }
2121
2122 s+= s_res;
2123 t+= t_res;
2124 }
2125 return (int) (t_is_prefix ? (t - te) : ((se - s) - (te - t)));
2126 }
2127
2128
2129 /**
2130 Compare strings, discarding end space
2131
2132 If one string is shorter as the other, then we space extend the other
2133 so that the strings have equal length.
2134
2135 This will ensure that the following things hold:
2136
2137 "a" == "a "
2138 "a\0" < "a"
2139 "a\0" < "a "
2140
2141 @param cs Character set pinter.
2142 @param a First string to compare.
2143 @param a_length Length of 'a'.
2144 @param b Second string to compare.
2145 @param b_length Length of 'b'.
2146
2147 IMPLEMENTATION
2148
2149 @return Comparison result.
2150 @retval Negative number, if a less than b.
2151 @retval 0, if a is equal to b
2152 @retval Positive number, if a > b
2153 */
2154
2155
2156 static int
my_strnncollsp_utf32(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference)2157 my_strnncollsp_utf32(const CHARSET_INFO *cs,
2158 const uchar *s, size_t slen,
2159 const uchar *t, size_t tlen,
2160 my_bool diff_if_only_endspace_difference)
2161 {
2162 int res;
2163 my_wc_t s_wc= 0, t_wc= 0;
2164 const uchar *se= s + slen, *te= t + tlen;
2165 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2166
2167 assert((slen % 4) == 0);
2168 assert((tlen % 4) == 0);
2169
2170 #ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
2171 diff_if_only_endspace_difference= FALSE;
2172 #endif
2173
2174 while ( s < se && t < te )
2175 {
2176 int s_res= my_utf32_uni(cs, &s_wc, s, se);
2177 int t_res= my_utf32_uni(cs, &t_wc, t, te);
2178
2179 if ( s_res <= 0 || t_res <= 0 )
2180 {
2181 /* Incorrect string, compare bytewise */
2182 return my_bincmp(s, se, t, te);
2183 }
2184
2185 my_tosort_utf32(uni_plane, &s_wc);
2186 my_tosort_utf32(uni_plane, &t_wc);
2187
2188 if ( s_wc != t_wc )
2189 {
2190 return s_wc > t_wc ? 1 : -1;
2191 }
2192
2193 s+= s_res;
2194 t+= t_res;
2195 }
2196
2197 slen= (size_t) (se - s);
2198 tlen= (size_t) (te - t);
2199 res= 0;
2200
2201 if (slen != tlen)
2202 {
2203 int s_res, swap= 1;
2204 if (diff_if_only_endspace_difference)
2205 res= 1; /* Assume 's' is bigger */
2206 if (slen < tlen)
2207 {
2208 slen= tlen;
2209 s= t;
2210 se= te;
2211 swap= -1;
2212 res= -res;
2213 }
2214
2215 for ( ; s < se; s+= s_res)
2216 {
2217 if ((s_res= my_utf32_uni(cs, &s_wc, s, se)) < 0)
2218 {
2219 assert(0);
2220 return 0;
2221 }
2222 if (s_wc != ' ')
2223 return (s_wc < ' ') ? -swap : swap;
2224 }
2225 }
2226 return res;
2227 }
2228
2229
2230 static size_t
my_strnxfrmlen_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),size_t len)2231 my_strnxfrmlen_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2232 size_t len)
2233 {
2234 return len / 2;
2235 }
2236
2237
2238 static uint
my_ismbchar_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)))2239 my_ismbchar_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2240 const char *b MY_ATTRIBUTE((unused)),
2241 const char *e MY_ATTRIBUTE((unused)))
2242 {
2243 return 4;
2244 }
2245
2246
2247 static uint
my_mbcharlen_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))2248 my_mbcharlen_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
2249 uint c MY_ATTRIBUTE((unused)))
2250 {
2251 return 4;
2252 }
2253
2254
2255 static size_t
my_vsnprintf_utf32(char * dst,size_t n,const char * fmt,va_list ap)2256 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
2257 {
2258 char *start= dst, *end= dst + n;
2259 assert((n % 4) == 0);
2260 for (; *fmt ; fmt++)
2261 {
2262 if (fmt[0] != '%')
2263 {
2264 if (dst >= end) /* End of buffer */
2265 break;
2266
2267 *dst++= '\0';
2268 *dst++= '\0';
2269 *dst++= '\0';
2270 *dst++= *fmt; /* Copy ordinary char */
2271 continue;
2272 }
2273
2274 fmt++;
2275
2276 /* Skip if max size is used (to be compatible with printf) */
2277 while ( (*fmt>='0' && *fmt<='9') || *fmt == '.' || *fmt == '-')
2278 fmt++;
2279
2280 if (*fmt == 'l')
2281 fmt++;
2282
2283 if (*fmt == 's') /* String parameter */
2284 {
2285 char *par= va_arg(ap, char *);
2286 size_t plen;
2287 size_t left_len= (size_t)(end - dst);
2288 if (!par) par= (char*)"(null)";
2289 plen= strlen(par);
2290 if (left_len <= plen*4)
2291 plen= left_len / 4 - 1;
2292
2293 for ( ; plen ; plen--, dst+= 4, par++)
2294 {
2295 dst[0]= '\0';
2296 dst[1]= '\0';
2297 dst[2]= '\0';
2298 dst[3]= par[0];
2299 }
2300 continue;
2301 }
2302 else if (*fmt == 'd' || *fmt == 'u') /* Integer parameter */
2303 {
2304 int iarg;
2305 char nbuf[16];
2306 char *pbuf= nbuf;
2307
2308 if ((size_t) (end - dst) < 64)
2309 break;
2310 iarg= va_arg(ap, int);
2311 if (*fmt == 'd')
2312 int10_to_str((long) iarg, nbuf, -10);
2313 else
2314 int10_to_str((long) (uint) iarg,nbuf,10);
2315
2316 for (; pbuf[0]; pbuf++)
2317 {
2318 *dst++= '\0';
2319 *dst++= '\0';
2320 *dst++= '\0';
2321 *dst++= *pbuf;
2322 }
2323 continue;
2324 }
2325
2326 /* We come here on '%%', unknown code or too long parameter */
2327 if (dst == end)
2328 break;
2329 *dst++= '\0';
2330 *dst++= '\0';
2331 *dst++= '\0';
2332 *dst++= '%'; /* % used as % or unknown code */
2333 }
2334
2335 assert(dst < end);
2336 *dst++= '\0';
2337 *dst++= '\0';
2338 *dst++= '\0';
2339 *dst++= '\0'; /* End of errmessage */
2340 return (size_t) (dst - start - 4);
2341 }
2342
2343
2344 static size_t
my_snprintf_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * to,size_t n,const char * fmt,...)2345 my_snprintf_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2346 char* to, size_t n, const char* fmt, ...)
2347 {
2348 size_t retval;
2349 va_list args;
2350 va_start(args,fmt);
2351 retval= my_vsnprintf_utf32(to, n, fmt, args);
2352 va_end(args);
2353 return retval;
2354 }
2355
2356
2357 static longlong
my_strtoll10_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * nptr,char ** endptr,int * error)2358 my_strtoll10_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2359 const char *nptr, char **endptr, int *error)
2360 {
2361 const char *s, *end, *start, *n_end, *true_end;
2362 uchar c;
2363 unsigned long i, j, k;
2364 ulonglong li;
2365 int negative;
2366 ulong cutoff, cutoff2, cutoff3;
2367
2368 s= nptr;
2369 /* If fixed length string */
2370 if (endptr)
2371 {
2372 /* Make sure string length is even */
2373 end= s + ((*endptr - s) / 4) * 4;
2374 while (s < end && !s[0] && !s[1] && !s[2] &&
2375 (s[3] == ' ' || s[3] == '\t'))
2376 s+= 4;
2377 if (s == end)
2378 goto no_conv;
2379 }
2380 else
2381 {
2382 /* We don't support null terminated strings in UCS2 */
2383 goto no_conv;
2384 }
2385
2386 /* Check for a sign. */
2387 negative= 0;
2388 if (!s[0] && !s[1] && !s[2] && s[3] == '-')
2389 {
2390 *error= -1; /* Mark as negative number */
2391 negative= 1;
2392 s+= 4;
2393 if (s == end)
2394 goto no_conv;
2395 cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2;
2396 cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
2397 cutoff3= MAX_NEGATIVE_NUMBER % 100;
2398 }
2399 else
2400 {
2401 *error= 0;
2402 if (!s[0] && !s[1] && !s[2] && s[3] == '+')
2403 {
2404 s+= 4;
2405 if (s == end)
2406 goto no_conv;
2407 }
2408 cutoff= ULONGLONG_MAX / LFACTOR2;
2409 cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
2410 cutoff3= ULONGLONG_MAX % 100;
2411 }
2412
2413 /* Handle case where we have a lot of pre-zero */
2414 if (!s[0] && !s[1] && !s[2] && s[3] == '0')
2415 {
2416 i= 0;
2417 do
2418 {
2419 s+= 4;
2420 if (s == end)
2421 goto end_i; /* Return 0 */
2422 }
2423 while (!s[0] && !s[1] && !s[2] && s[3] == '0');
2424 n_end= s + 4 * INIT_CNT;
2425 }
2426 else
2427 {
2428 /* Read first digit to check that it's a valid number */
2429 if (s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2430 goto no_conv;
2431 i= c;
2432 s+= 4;
2433 n_end= s + 4 * (INIT_CNT-1);
2434 }
2435
2436 /* Handle first 9 digits and store them in i */
2437 if (n_end > end)
2438 n_end= end;
2439 for (; s != n_end ; s+= 4)
2440 {
2441 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2442 goto end_i;
2443 i= i * 10 + c;
2444 }
2445 if (s == end)
2446 goto end_i;
2447
2448 /* Handle next 9 digits and store them in j */
2449 j= 0;
2450 start= s; /* Used to know how much to shift i */
2451 n_end= true_end= s + 4 * INIT_CNT;
2452 if (n_end > end)
2453 n_end= end;
2454 do
2455 {
2456 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2457 goto end_i_and_j;
2458 j= j * 10 + c;
2459 s+= 4;
2460 } while (s != n_end);
2461 if (s == end)
2462 {
2463 if (s != true_end)
2464 goto end_i_and_j;
2465 goto end3;
2466 }
2467 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2468 goto end3;
2469
2470 /* Handle the next 1 or 2 digits and store them in k */
2471 k=c;
2472 s+= 4;
2473 if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2474 goto end4;
2475 k= k * 10 + c;
2476 s+= 2;
2477 *endptr= (char*) s;
2478
2479 /* number string should have ended here */
2480 if (s != end && !s[0] && !s[1] && !s[2] && (c= (s[3] - '0')) <= 9)
2481 goto overflow;
2482
2483 /* Check that we didn't get an overflow with the last digit */
2484 if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
2485 k > cutoff3)))
2486 goto overflow;
2487 li= i * LFACTOR2+ (ulonglong) j * 100 + k;
2488 return (longlong) li;
2489
2490 overflow: /* *endptr is set here */
2491 *error= MY_ERRNO_ERANGE;
2492 return negative ? LLONG_MIN : (longlong) ULONGLONG_MAX;
2493
2494 end_i:
2495 *endptr= (char*) s;
2496 return (negative ? ((longlong) -(long) i) : (longlong) i);
2497
2498 end_i_and_j:
2499 li= (ulonglong) i * lfactor[(size_t) (s-start) / 4] + j;
2500 *endptr= (char*) s;
2501 return (negative ? -((longlong) li) : (longlong) li);
2502
2503 end3:
2504 li= (ulonglong) i*LFACTOR+ (ulonglong) j;
2505 *endptr= (char*) s;
2506 return (negative ? -((longlong) li) : (longlong) li);
2507
2508 end4:
2509 li= (ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
2510 *endptr= (char*) s;
2511 if (negative)
2512 {
2513 if (li > MAX_NEGATIVE_NUMBER)
2514 goto overflow;
2515 return -((longlong) li);
2516 }
2517 return (longlong) li;
2518
2519 no_conv:
2520 /* There was no number to convert. */
2521 *error= MY_ERRNO_EDOM;
2522 *endptr= (char *) nptr;
2523 return 0;
2524 }
2525
2526
2527 static size_t
my_numchars_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e)2528 my_numchars_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2529 const char *b, const char *e)
2530 {
2531 return (size_t) (e - b) / 4;
2532 }
2533
2534
2535 static size_t
my_charpos_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t pos)2536 my_charpos_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2537 const char *b, const char *e, size_t pos)
2538 {
2539 size_t string_length= (size_t) (e - b);
2540 return pos * 4 > string_length ? string_length + 4 : pos * 4;
2541 }
2542
2543
2544 static size_t
my_well_formed_len_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t nchars,int * error)2545 my_well_formed_len_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2546 const char *b, const char *e,
2547 size_t nchars, int *error)
2548 {
2549 /* Ensure string length is divisible by 4 */
2550 const char *b0= b;
2551 size_t length= e - b;
2552 assert((length % 4) == 0);
2553 *error= 0;
2554 nchars*= 4;
2555 if (length > nchars)
2556 {
2557 length= nchars;
2558 e= b + nchars;
2559 }
2560 for (; b < e; b+= 4)
2561 {
2562 /* Don't accept characters greater than U+10FFFF */
2563 if (b[0] || (uchar) b[1] > 0x10)
2564 {
2565 *error= 1;
2566 return b - b0;
2567 }
2568 }
2569 return length;
2570 }
2571
2572
2573 static
my_fill_utf32(const CHARSET_INFO * cs,char * s,size_t slen,int fill)2574 void my_fill_utf32(const CHARSET_INFO *cs,
2575 char *s, size_t slen, int fill)
2576 {
2577 char buf[10];
2578 char *e= s + slen;
2579
2580 assert((slen % 4) == 0);
2581 {
2582 #ifndef NDEBUG
2583 uint buflen=
2584 #endif
2585 cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
2586 (uchar*) buf + sizeof(buf));
2587 assert(buflen == 4);
2588 }
2589 while (s < e)
2590 {
2591 memcpy(s, buf, 4);
2592 s+= 4;
2593 }
2594 }
2595
2596
2597 static size_t
my_lengthsp_utf32(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * ptr,size_t length)2598 my_lengthsp_utf32(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2599 const char *ptr, size_t length)
2600 {
2601 const char *end= ptr + length;
2602 assert((length % 4) == 0);
2603 while (end > ptr + 3 && end[-1] == ' ' && !end[-2] && !end[-3] && !end[-4])
2604 end-= 4;
2605 return (size_t) (end - ptr);
2606 }
2607
2608
2609 static int
my_wildcmp_utf32_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2610 my_wildcmp_utf32_ci(const CHARSET_INFO *cs,
2611 const char *str, const char *str_end,
2612 const char *wildstr, const char *wildend,
2613 int escape, int w_one, int w_many)
2614 {
2615 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2616 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2617 escape, w_one, w_many, uni_plane);
2618 }
2619
2620
2621 static int
my_wildcmp_utf32_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2622 my_wildcmp_utf32_bin(const CHARSET_INFO *cs,
2623 const char *str,const char *str_end,
2624 const char *wildstr,const char *wildend,
2625 int escape, int w_one, int w_many)
2626 {
2627 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2628 escape, w_one, w_many, NULL);
2629 }
2630
2631
2632 static int
my_strnncoll_utf32_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)2633 my_strnncoll_utf32_bin(const CHARSET_INFO *cs,
2634 const uchar *s, size_t slen,
2635 const uchar *t, size_t tlen,
2636 my_bool t_is_prefix)
2637 {
2638 my_wc_t s_wc= 0, t_wc= 0;
2639 const uchar *se= s + slen;
2640 const uchar *te= t + tlen;
2641
2642 while (s < se && t < te)
2643 {
2644 int s_res= my_utf32_uni(cs, &s_wc, s, se);
2645 int t_res= my_utf32_uni(cs, &t_wc, t, te);
2646
2647 if (s_res <= 0 || t_res <= 0)
2648 {
2649 /* Incorrect string, compare by char value */
2650 return my_bincmp(s, se, t, te);
2651 }
2652 if (s_wc != t_wc)
2653 {
2654 return s_wc > t_wc ? 1 : -1;
2655 }
2656
2657 s+= s_res;
2658 t+= t_res;
2659 }
2660 return (int) (t_is_prefix ? (t-te) : ((se - s) - (te - t)));
2661 }
2662
2663
2664 static inline my_wc_t
my_utf32_get(const uchar * s)2665 my_utf32_get(const uchar *s)
2666 {
2667 return
2668 ((my_wc_t) s[0] << 24) +
2669 ((my_wc_t) s[1] << 16) +
2670 ((my_wc_t) s[2] << 8) +
2671 s[3];
2672 }
2673
2674
2675 static int
my_strnncollsp_utf32_bin(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference MY_ATTRIBUTE ((unused)))2676 my_strnncollsp_utf32_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
2677 const uchar *s, size_t slen,
2678 const uchar *t, size_t tlen,
2679 my_bool diff_if_only_endspace_difference
2680 MY_ATTRIBUTE((unused)))
2681 {
2682 const uchar *se, *te;
2683 size_t minlen;
2684
2685 assert((slen % 4) == 0);
2686 assert((tlen % 4) == 0);
2687
2688 se= s + slen;
2689 te= t + tlen;
2690
2691 for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 4)
2692 {
2693 my_wc_t s_wc= my_utf32_get(s);
2694 my_wc_t t_wc= my_utf32_get(t);
2695 if (s_wc != t_wc)
2696 return s_wc > t_wc ? 1 : -1;
2697
2698 s+= 4;
2699 t+= 4;
2700 }
2701
2702 if (slen != tlen)
2703 {
2704 int swap= 1;
2705 if (slen < tlen)
2706 {
2707 s= t;
2708 se= te;
2709 swap= -1;
2710 }
2711
2712 for ( ; s < se ; s+= 4)
2713 {
2714 my_wc_t s_wc= my_utf32_get(s);
2715 if (s_wc != ' ')
2716 return (s_wc < ' ') ? -swap : swap;
2717 }
2718 }
2719 return 0;
2720 }
2721
2722
2723 static size_t
my_scan_utf32(const CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)2724 my_scan_utf32(const CHARSET_INFO *cs,
2725 const char *str, const char *end, int sequence_type)
2726 {
2727 const char *str0= str;
2728
2729 switch (sequence_type)
2730 {
2731 case MY_SEQ_SPACES:
2732 for ( ; str < end; )
2733 {
2734 my_wc_t wc;
2735 int res= my_utf32_uni(cs, &wc, (uchar*) str, (uchar*) end);
2736 if (res < 0 || wc != ' ')
2737 break;
2738 str+= res;
2739 }
2740 return (size_t) (str - str0);
2741 default:
2742 return 0;
2743 }
2744 }
2745
2746
2747 static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
2748 {
2749 NULL, /* init */
2750 my_strnncoll_utf32,
2751 my_strnncollsp_utf32,
2752 my_strnxfrm_unicode,
2753 my_strnxfrmlen_utf32,
2754 my_like_range_generic,
2755 my_wildcmp_utf32_ci,
2756 my_strcasecmp_mb2_or_mb4,
2757 my_instr_mb,
2758 my_hash_sort_utf32,
2759 my_propagate_simple
2760 };
2761
2762
2763 static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
2764 {
2765 NULL, /* init */
2766 my_strnncoll_utf32_bin,
2767 my_strnncollsp_utf32_bin,
2768 my_strnxfrm_unicode_full_bin,
2769 my_strnxfrmlen_unicode_full_bin,
2770 my_like_range_generic,
2771 my_wildcmp_utf32_bin,
2772 my_strcasecmp_mb2_or_mb4,
2773 my_instr_mb,
2774 my_hash_sort_utf32,
2775 my_propagate_simple
2776 };
2777
2778
2779 MY_CHARSET_HANDLER my_charset_utf32_handler=
2780 {
2781 NULL, /* init */
2782 my_ismbchar_utf32,
2783 my_mbcharlen_utf32,
2784 my_numchars_utf32,
2785 my_charpos_utf32,
2786 my_well_formed_len_utf32,
2787 my_lengthsp_utf32,
2788 my_numcells_mb,
2789 my_utf32_uni,
2790 my_uni_utf32,
2791 my_mb_ctype_mb,
2792 my_caseup_str_mb2_or_mb4,
2793 my_casedn_str_mb2_or_mb4,
2794 my_caseup_utf32,
2795 my_casedn_utf32,
2796 my_snprintf_utf32,
2797 my_l10tostr_mb2_or_mb4,
2798 my_ll10tostr_mb2_or_mb4,
2799 my_fill_utf32,
2800 my_strntol_mb2_or_mb4,
2801 my_strntoul_mb2_or_mb4,
2802 my_strntoll_mb2_or_mb4,
2803 my_strntoull_mb2_or_mb4,
2804 my_strntod_mb2_or_mb4,
2805 my_strtoll10_utf32,
2806 my_strntoull10rnd_mb2_or_mb4,
2807 my_scan_utf32
2808 };
2809
2810
2811 CHARSET_INFO my_charset_utf32_general_ci=
2812 {
2813 60,0,0, /* number */
2814 MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_UNICODE_SUPPLEMENT|MY_CS_NONASCII,
2815 "utf32", /* cs name */
2816 "utf32_general_ci", /* name */
2817 "UTF-32 Unicode", /* comment */
2818 NULL, /* tailoring */
2819 NULL, /* ctype */
2820 NULL, /* to_lower */
2821 NULL, /* to_upper */
2822 NULL, /* sort_order */
2823 NULL, /* uca */
2824 NULL, /* tab_to_uni */
2825 NULL, /* tab_from_uni */
2826 &my_unicase_default, /* caseinfo */
2827 NULL, /* state_map */
2828 NULL, /* ident_map */
2829 1, /* strxfrm_multiply */
2830 1, /* caseup_multiply */
2831 1, /* casedn_multiply */
2832 4, /* mbminlen */
2833 4, /* mbmaxlen */
2834 1, /* mbmaxlenlen */
2835 0, /* min_sort_char */
2836 0xFFFF, /* max_sort_char */
2837 ' ', /* pad char */
2838 0, /* escape_with_backslash_is_dangerous */
2839 1, /* levels_for_compare */
2840 1, /* levels_for_order */
2841 &my_charset_utf32_handler,
2842 &my_collation_utf32_general_ci_handler
2843 };
2844
2845
2846 CHARSET_INFO my_charset_utf32_bin=
2847 {
2848 61,0,0, /* number */
2849 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
2850 "utf32", /* cs name */
2851 "utf32_bin", /* name */
2852 "UTF-32 Unicode", /* comment */
2853 NULL, /* tailoring */
2854 NULL, /* ctype */
2855 NULL, /* to_lower */
2856 NULL, /* to_upper */
2857 NULL, /* sort_order */
2858 NULL, /* uca */
2859 NULL, /* tab_to_uni */
2860 NULL, /* tab_from_uni */
2861 &my_unicase_default, /* caseinfo */
2862 NULL, /* state_map */
2863 NULL, /* ident_map */
2864 1, /* strxfrm_multiply */
2865 1, /* caseup_multiply */
2866 1, /* casedn_multiply */
2867 4, /* mbminlen */
2868 4, /* mbmaxlen */
2869 1, /* mbmaxlenlen */
2870 0, /* min_sort_char */
2871 0xFFFF, /* max_sort_char */
2872 ' ', /* pad char */
2873 0, /* escape_with_backslash_is_dangerous */
2874 1, /* levels_for_compare */
2875 1, /* levels_for_order */
2876 &my_charset_utf32_handler,
2877 &my_collation_utf32_bin_handler
2878 };
2879
2880
2881 #endif /* HAVE_CHARSET_utf32 */
2882
2883
2884 #ifdef HAVE_CHARSET_ucs2
2885
2886 static const uchar ctype_ucs2[] = {
2887 0,
2888 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
2889 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2890 72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
2891 132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
2892 16,129,129,129,129,129,129, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2893 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16,
2894 16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2895 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32,
2896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2903 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2904 };
2905
2906 static const uchar to_lower_ucs2[] = {
2907 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2908 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2909 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2910 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
2911 64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
2912 112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
2913 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
2914 112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
2915 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2916 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
2917 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2918 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2919 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2920 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2921 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
2922 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
2923 };
2924
2925 static const uchar to_upper_ucs2[] = {
2926 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2927 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2928 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2929 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
2930 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
2931 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
2932 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
2933 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
2934 128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
2935 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
2936 160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
2937 176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
2938 192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
2939 208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
2940 224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
2941 240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
2942 };
2943
2944
/*
  Decode one UCS2 character (two bytes, high byte first) from [s, e).

  RETURN
    2                 one character was decoded into *pwc
    MY_CS_TOOSMALL2   fewer than two bytes are available
*/
static int my_ucs2_uni(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
                       my_wc_t * pwc, const uchar *s, const uchar *e)
{
  if (s + 2 <= e)
  {
    *pwc= (s[0] << 8) | s[1];
    return 2;
  }
  return MY_CS_TOOSMALL2;             /* Need 2 characters */
}
2954
/*
  Encode the wide character wc as UCS2 (two bytes, high byte first)
  into [r, e).

  RETURN
    2                 two bytes were written
    MY_CS_TOOSMALL2   the output buffer has fewer than two bytes
    MY_CS_ILUNI       wc is greater than 0xFFFF
*/
static int my_uni_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
                       my_wc_t wc, uchar *r, uchar *e)
{
  int rc;

  if (r + 2 > e)
    rc= MY_CS_TOOSMALL2;
  else if (wc > 0xFFFF)
    rc= MY_CS_ILUNI;        /* UCS2 does not support characters outside BMP */
  else
  {
    r[0]= (uchar) (wc >> 8);
    r[1]= (uchar) (wc & 0xFF);
    rc= 2;
  }
  return rc;
}
2968
2969
2970 static inline void
my_tolower_ucs2(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2971 my_tolower_ucs2(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2972 {
2973 const MY_UNICASE_CHARACTER *page;
2974 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2975 *wc= page[*wc & 0xFF].tolower;
2976 }
2977
2978
2979 static inline void
my_toupper_ucs2(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2980 my_toupper_ucs2(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2981 {
2982 const MY_UNICASE_CHARACTER *page;
2983 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2984 *wc= page[*wc & 0xFF].toupper;
2985 }
2986
2987
2988 static inline void
my_tosort_ucs2(const MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2989 my_tosort_ucs2(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2990 {
2991 const MY_UNICASE_CHARACTER *page;
2992 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
2993 *wc= page[*wc & 0xFF].sort;
2994 }
2995
2996
/*
  Convert a UCS2 string to upper case in place.

  Each character is decoded, mapped through cs->caseinfo and written back
  over itself. Processing stops at the first character that cannot be
  decoded or re-encoded in the same number of bytes.
  Debug builds assert that src == dst and srclen == dstlen.

  RETURN
    srclen, unconditionally
*/
static size_t my_caseup_ucs2(const CHARSET_INFO *cs, char *src, size_t srclen,
                             char *dst MY_ATTRIBUTE((unused)),
                             size_t dstlen MY_ATTRIBUTE((unused)))
{
  char *pos= src;
  char *const limit= src + srclen;
  const MY_UNICASE_INFO *folding= cs->caseinfo;
  assert(src == dst && srclen == dstlen);

  while (pos < limit)
  {
    my_wc_t wc;
    const int nread= my_ucs2_uni(cs, &wc, (uchar *) pos, (uchar*) limit);
    if (nread <= 0)
      break;

    my_toupper_ucs2(folding, &wc);
    if (my_uni_ucs2(cs, wc, (uchar*) pos, (uchar*) limit) != nread)
      break;
    pos+= nread;
  }
  return srclen;
}
3017
3018
/*
  Update the hash state (*n1, *n2) with the sort values of a UCS2 string.

  Trailing spaces (byte pairs 00 20) are skipped first. Every remaining
  character is mapped through my_tosort_ucs2() and its low byte, then its
  high byte, is mixed into the state.
*/
static void my_hash_sort_ucs2(const CHARSET_INFO *cs, const uchar *s,
                              size_t slen, ulong *n1, ulong *n2)
{
  const uchar *end= s + slen;
  const MY_UNICASE_INFO *folding= cs->caseinfo;
  ulong h1;
  ulong h2;

  /* Ignore trailing spaces */
  while (end > s + 1 && end[-1] == ' ' && end[-2] == '\0')
    end-= 2;

  h1= *n1;
  h2= *n2;

  while (s < end)
  {
    my_wc_t wc;
    const int nbytes= my_ucs2_uni(cs, &wc, (uchar *) s, (uchar*) end);
    if (nbytes <= 0)
      break;

    my_tosort_ucs2(folding, &wc);

    /* Low byte */
    h1^= (((h1 & 63) + h2) * (wc & 0xFF)) + (h1 << 8);
    h2+= 3;
    /* High byte */
    h1^= (((h1 & 63) + h2) * (wc >> 8)) + (h1 << 8);
    h2+= 3;

    s+= nbytes;
  }

  *n1= h1;
  *n2= h2;
}
3048
3049
/*
  Convert a UCS2 string to lower case in place.

  Each character is decoded, mapped through cs->caseinfo and written back
  over itself. Processing stops at the first character that cannot be
  decoded or re-encoded in the same number of bytes.
  Debug builds assert that src == dst and srclen == dstlen.

  RETURN
    srclen, unconditionally
*/
static size_t my_casedn_ucs2(const CHARSET_INFO *cs, char *src, size_t srclen,
                             char *dst MY_ATTRIBUTE((unused)),
                             size_t dstlen MY_ATTRIBUTE((unused)))
{
  char *pos= src;
  char *const limit= src + srclen;
  const MY_UNICASE_INFO *folding= cs->caseinfo;
  assert(src == dst && srclen == dstlen);

  while (pos < limit)
  {
    my_wc_t wc;
    const int nread= my_ucs2_uni(cs, &wc, (uchar*) pos, (uchar*) limit);
    if (nread <= 0)
      break;

    my_tolower_ucs2(folding, &wc);
    if (my_uni_ucs2(cs, wc, (uchar*) pos, (uchar*) limit) != nread)
      break;
    pos+= nread;
  }
  return srclen;
}
3070
3071
3072 static void
my_fill_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),char * s,size_t l,int fill)3073 my_fill_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3074 char *s, size_t l, int fill)
3075 {
3076 assert(fill <= 0xFFFF);
3077 for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
3078 }
3079
3080
/*
  Compare two UCS2 strings using the sort values from cs->caseinfo.

  Characters are decoded pairwise and mapped through my_tosort_ucs2();
  the first differing pair decides the result (1 or -1). If either side
  cannot be decoded, the difference of the current first bytes is
  returned. When all decoded pairs are equal the result is derived from
  the bytes left over: (t - te) if t_is_prefix is set, otherwise the
  difference of the remaining lengths.
*/
static int my_strnncoll_ucs2(const CHARSET_INFO *cs,
                             const uchar *s, size_t slen,
                             const uchar *t, size_t tlen,
                             my_bool t_is_prefix)
{
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  const MY_UNICASE_INFO *folding= cs->caseinfo;
  my_wc_t s_code= 0, t_code= 0;

  while (s < s_end && t < t_end)
  {
    const int s_len= my_ucs2_uni(cs, &s_code, s, s_end);
    const int t_len= my_ucs2_uni(cs, &t_code, t, t_end);

    /* Incorrect string, compare by char value */
    if (s_len <= 0 || t_len <= 0)
      return ((int) s[0] - (int) t[0]);

    my_tosort_ucs2(folding, &s_code);
    my_tosort_ucs2(folding, &t_code);

    if (s_code != t_code)
      return s_code > t_code ? 1 : -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
3116
3117 /*
3118 Compare strings, discarding end space
3119
3120 SYNOPSIS
3121 my_strnncollsp_ucs2()
3122 cs character set handler
3123 a First string to compare
3124 a_length Length of 'a'
3125 b Second string to compare
3126 b_length Length of 'b'
3127
3128 IMPLEMENTATION
3129 If one string is shorter as the other, then we space extend the other
3130 so that the strings have equal length.
3131
3132 This will ensure that the following things hold:
3133
3134 "a" == "a "
3135 "a\0" < "a"
3136 "a\0" < "a "
3137
3138 RETURN
3139 < 0 a < b
3140 = 0 a == b
3141 > 0 a > b
3142 */
3143
my_strnncollsp_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool diff_if_only_endspace_difference MY_ATTRIBUTE ((unused)))3144 static int my_strnncollsp_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3145 const uchar *s, size_t slen,
3146 const uchar *t, size_t tlen,
3147 my_bool diff_if_only_endspace_difference
3148 MY_ATTRIBUTE((unused)))
3149 {
3150 const uchar *se, *te;
3151 size_t minlen;
3152 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3153
3154 /* extra safety to make sure the lengths are even numbers */
3155 slen&= ~1;
3156 tlen&= ~1;
3157
3158 se= s + slen;
3159 te= t + tlen;
3160
3161 for (minlen= MY_MIN(slen, tlen); minlen; minlen-= 2)
3162 {
3163 int s_wc = uni_plane->page[s[0]] ? (int) uni_plane->page[s[0]][s[1]].sort :
3164 (((int) s[0]) << 8) + (int) s[1];
3165
3166 int t_wc = uni_plane->page[t[0]] ? (int) uni_plane->page[t[0]][t[1]].sort :
3167 (((int) t[0]) << 8) + (int) t[1];
3168 if ( s_wc != t_wc )
3169 return s_wc > t_wc ? 1 : -1;
3170
3171 s+= 2;
3172 t+= 2;
3173 }
3174
3175 if (slen != tlen)
3176 {
3177 int swap= 1;
3178 if (slen < tlen)
3179 {
3180 s= t;
3181 se= te;
3182 swap= -1;
3183 }
3184
3185 for ( ; s < se ; s+= 2)
3186 {
3187 if (s[0] || s[1] != ' ')
3188 return (s[0] == 0 && s[1] < ' ') ? -swap : swap;
3189 }
3190 }
3191 return 0;
3192 }
3193
3194
my_ismbchar_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)))3195 static uint my_ismbchar_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3196 const char *b MY_ATTRIBUTE((unused)),
3197 const char *e MY_ATTRIBUTE((unused)))
3198 {
3199 return 2;
3200 }
3201
3202
my_mbcharlen_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),uint c MY_ATTRIBUTE ((unused)))3203 static uint my_mbcharlen_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)) ,
3204 uint c MY_ATTRIBUTE((unused)))
3205 {
3206 return 2;
3207 }
3208
3209
3210 static
my_numchars_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e)3211 size_t my_numchars_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3212 const char *b, const char *e)
3213 {
3214 return (size_t) (e-b)/2;
3215 }
3216
3217
3218 static
my_charpos_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b MY_ATTRIBUTE ((unused)),const char * e MY_ATTRIBUTE ((unused)),size_t pos)3219 size_t my_charpos_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3220 const char *b MY_ATTRIBUTE((unused)),
3221 const char *e MY_ATTRIBUTE((unused)),
3222 size_t pos)
3223 {
3224 size_t string_length= (size_t) (e - b);
3225 return pos > string_length ? string_length + 2 : pos * 2;
3226 }
3227
3228
3229 static
my_well_formed_len_ucs2(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const char * b,const char * e,size_t nchars,int * error)3230 size_t my_well_formed_len_ucs2(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3231 const char *b, const char *e,
3232 size_t nchars, int *error)
3233 {
3234 /* Ensure string length is dividable with 2 */
3235 size_t nbytes= ((size_t) (e-b)) & ~(size_t) 1;
3236 *error= 0;
3237 nchars*= 2;
3238 return MY_MIN(nbytes, nchars);
3239 }
3240
3241
3242 static
my_wildcmp_ucs2_ci(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3243 int my_wildcmp_ucs2_ci(const CHARSET_INFO *cs,
3244 const char *str,const char *str_end,
3245 const char *wildstr,const char *wildend,
3246 int escape, int w_one, int w_many)
3247 {
3248 const MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3249 return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3250 escape,w_one,w_many,uni_plane);
3251 }
3252
3253
3254 static
my_wildcmp_ucs2_bin(const CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3255 int my_wildcmp_ucs2_bin(const CHARSET_INFO *cs,
3256 const char *str,const char *str_end,
3257 const char *wildstr,const char *wildend,
3258 int escape, int w_one, int w_many)
3259 {
3260 return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3261 escape,w_one,w_many,NULL);
3262 }
3263
3264
3265 static
my_strnncoll_ucs2_bin(const CHARSET_INFO * cs,const uchar * s,size_t slen,const uchar * t,size_t tlen,my_bool t_is_prefix)3266 int my_strnncoll_ucs2_bin(const CHARSET_INFO *cs,
3267 const uchar *s, size_t slen,
3268 const uchar *t, size_t tlen,
3269 my_bool t_is_prefix)
3270 {
3271 int s_res,t_res;
3272 my_wc_t s_wc= 0, t_wc= 0;
3273 const uchar *se=s+slen;
3274 const uchar *te=t+tlen;
3275
3276 while ( s < se && t < te )
3277 {
3278 s_res=my_ucs2_uni(cs,&s_wc, s, se);
3279 t_res=my_ucs2_uni(cs,&t_wc, t, te);
3280
3281 if ( s_res <= 0 || t_res <= 0 )
3282 {
3283 /* Incorrect string, compare by char value */
3284 return ((int)s[0]-(int)t[0]);
3285 }
3286 if ( s_wc != t_wc )
3287 {
3288 return s_wc > t_wc ? 1 : -1;
3289 }
3290
3291 s+=s_res;
3292 t+=t_res;
3293 }
3294 return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
3295 }
3296
/*
  Compare two UCS2 strings by their two-byte character values, treating
  the shorter string as if it were padded with spaces.

  The common prefix is compared pair by pair. If it is equal and the
  lengths differ, the tail of the longer string is compared against ' ':
  a tail consisting only of spaces makes the strings equal.

  RETURN
    < 0  s < t
    = 0  s == t
    > 0  s > t
*/
static int my_strnncollsp_ucs2_bin(const CHARSET_INFO *cs
                                   MY_ATTRIBUTE((unused)),
                                   const uchar *s, size_t slen,
                                   const uchar *t, size_t tlen,
                                   my_bool diff_if_only_endspace_difference
                                   MY_ATTRIBUTE((unused)))
{
  const uchar *s_end, *t_end;
  const uchar *rest, *rest_end;
  size_t remaining;
  int sign;

  /* extra safety to make sure the lengths are even numbers */
  slen&= ~(size_t) 1;
  tlen&= ~(size_t) 1;

  s_end= s + slen;
  t_end= t + tlen;

  /* Compare the part both strings have in common */
  remaining= MY_MIN(slen, tlen);
  while (remaining)
  {
    const int s_code= (s[0] << 8) | s[1];
    const int t_code= (t[0] << 8) | t[1];
    if (s_code != t_code)
      return s_code > t_code ? 1 : -1;

    s+= 2;
    t+= 2;
    remaining-= 2;
  }

  if (slen == tlen)
    return 0;

  /* Pick the tail of the longer string; sign flips the result for t */
  if (slen < tlen)
  {
    rest= t;
    rest_end= t_end;
    sign= -1;
  }
  else
  {
    rest= s;
    rest_end= s_end;
    sign= 1;
  }

  for ( ; rest < rest_end; rest+= 2)
  {
    if (rest[0] || rest[1] != ' ')
      return (rest[0] == 0 && rest[1] < ' ') ? -sign : sign;
  }
  return 0;
}
3343
3344
3345 static
my_hash_sort_ucs2_bin(const CHARSET_INFO * cs MY_ATTRIBUTE ((unused)),const uchar * key,size_t len,ulong * nr1,ulong * nr2)3346 void my_hash_sort_ucs2_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
3347 const uchar *key, size_t len,ulong *nr1, ulong *nr2)
3348 {
3349 const uchar *pos = key;
3350 ulong tmp1;
3351 ulong tmp2;
3352
3353 key+= len;
3354
3355 while (key > pos+1 && key[-1] == ' ' && key[-2] == '\0')
3356 key-= 2;
3357
3358 tmp1= *nr1;
3359 tmp2= *nr2;
3360
3361 for (; pos < (uchar*) key ; pos++)
3362 {
3363 tmp1^=(ulong) ((((uint) tmp1 & 63) + tmp2) *
3364 ((uint)*pos)) + (tmp1 << 8);
3365 tmp2+=3;
3366 }
3367
3368 *nr1= tmp1;
3369 *nr2= tmp2;
3370 }
3371
3372
3373 static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
3374 {
3375 NULL, /* init */
3376 my_strnncoll_ucs2,
3377 my_strnncollsp_ucs2,
3378 my_strnxfrm_unicode,
3379 my_strnxfrmlen_simple,
3380 my_like_range_generic,
3381 my_wildcmp_ucs2_ci,
3382 my_strcasecmp_mb2_or_mb4,
3383 my_instr_mb,
3384 my_hash_sort_ucs2,
3385 my_propagate_simple
3386 };
3387
3388
3389 static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
3390 {
3391 NULL, /* init */
3392 my_strnncoll_ucs2_bin,
3393 my_strnncollsp_ucs2_bin,
3394 my_strnxfrm_unicode,
3395 my_strnxfrmlen_simple,
3396 my_like_range_generic,
3397 my_wildcmp_ucs2_bin,
3398 my_strcasecmp_mb2_or_mb4,
3399 my_instr_mb,
3400 my_hash_sort_ucs2_bin,
3401 my_propagate_simple
3402 };
3403
3404
3405 MY_CHARSET_HANDLER my_charset_ucs2_handler=
3406 {
3407 NULL, /* init */
3408 my_ismbchar_ucs2, /* ismbchar */
3409 my_mbcharlen_ucs2, /* mbcharlen */
3410 my_numchars_ucs2,
3411 my_charpos_ucs2,
3412 my_well_formed_len_ucs2,
3413 my_lengthsp_mb2,
3414 my_numcells_mb,
3415 my_ucs2_uni, /* mb_wc */
3416 my_uni_ucs2, /* wc_mb */
3417 my_mb_ctype_mb,
3418 my_caseup_str_mb2_or_mb4,
3419 my_casedn_str_mb2_or_mb4,
3420 my_caseup_ucs2,
3421 my_casedn_ucs2,
3422 my_snprintf_mb2,
3423 my_l10tostr_mb2_or_mb4,
3424 my_ll10tostr_mb2_or_mb4,
3425 my_fill_ucs2,
3426 my_strntol_mb2_or_mb4,
3427 my_strntoul_mb2_or_mb4,
3428 my_strntoll_mb2_or_mb4,
3429 my_strntoull_mb2_or_mb4,
3430 my_strntod_mb2_or_mb4,
3431 my_strtoll10_mb2,
3432 my_strntoull10rnd_mb2_or_mb4,
3433 my_scan_mb2
3434 };
3435
3436
3437 CHARSET_INFO my_charset_ucs2_general_ci=
3438 {
3439 35,0,0, /* number */
3440 MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
3441 "ucs2", /* cs name */
3442 "ucs2_general_ci", /* name */
3443 "", /* comment */
3444 NULL, /* tailoring */
3445 ctype_ucs2, /* ctype */
3446 to_lower_ucs2, /* to_lower */
3447 to_upper_ucs2, /* to_upper */
3448 to_upper_ucs2, /* sort_order */
3449 NULL, /* uca */
3450 NULL, /* tab_to_uni */
3451 NULL, /* tab_from_uni */
3452 &my_unicase_default,/* caseinfo */
3453 NULL, /* state_map */
3454 NULL, /* ident_map */
3455 1, /* strxfrm_multiply */
3456 1, /* caseup_multiply */
3457 1, /* casedn_multiply */
3458 2, /* mbminlen */
3459 2, /* mbmaxlen */
3460 1, /* mbmaxlenlen */
3461 0, /* min_sort_char */
3462 0xFFFF, /* max_sort_char */
3463 ' ', /* pad char */
3464 0, /* escape_with_backslash_is_dangerous */
3465 1, /* levels_for_compare */
3466 1, /* levels_for_order */
3467 &my_charset_ucs2_handler,
3468 &my_collation_ucs2_general_ci_handler
3469 };
3470
3471
3472 CHARSET_INFO my_charset_ucs2_general_mysql500_ci=
3473 {
3474 159, 0, 0, /* number */
3475 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
3476 "ucs2", /* cs name */
3477 "ucs2_general_mysql500_ci", /* name */
3478 "", /* comment */
3479 NULL, /* tailoring */
3480 ctype_ucs2, /* ctype */
3481 to_lower_ucs2, /* to_lower */
3482 to_upper_ucs2, /* to_upper */
3483 to_upper_ucs2, /* sort_order */
3484 NULL, /* uca */
3485 NULL, /* tab_to_uni */
3486 NULL, /* tab_from_uni */
3487 &my_unicase_mysql500, /* caseinfo */
3488 NULL, /* state_map */
3489 NULL, /* ident_map */
3490 1, /* strxfrm_multiply */
3491 1, /* caseup_multiply */
3492 1, /* casedn_multiply */
3493 2, /* mbminlen */
3494 2, /* mbmaxlen */
3495 1, /* mbmaxlenlen */
3496 0, /* min_sort_char */
3497 0xFFFF, /* max_sort_char */
3498 ' ', /* pad char */
3499 0, /* escape_with_backslash_is_dangerous */
3500 1, /* levels_for_compare */
3501 1, /* levels_for_order */
3502 &my_charset_ucs2_handler,
3503 &my_collation_ucs2_general_ci_handler
3504 };
3505
3506
3507 CHARSET_INFO my_charset_ucs2_bin=
3508 {
3509 90,0,0, /* number */
3510 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
3511 "ucs2", /* cs name */
3512 "ucs2_bin", /* name */
3513 "", /* comment */
3514 NULL, /* tailoring */
3515 ctype_ucs2, /* ctype */
3516 to_lower_ucs2, /* to_lower */
3517 to_upper_ucs2, /* to_upper */
3518 NULL, /* sort_order */
3519 NULL, /* uca */
3520 NULL, /* tab_to_uni */
3521 NULL, /* tab_from_uni */
3522 &my_unicase_default,/* caseinfo */
3523 NULL, /* state_map */
3524 NULL, /* ident_map */
3525 1, /* strxfrm_multiply */
3526 1, /* caseup_multiply */
3527 1, /* casedn_multiply */
3528 2, /* mbminlen */
3529 2, /* mbmaxlen */
3530 1, /* mbmaxlenlen */
3531 0, /* min_sort_char */
3532 0xFFFF, /* max_sort_char */
3533 ' ', /* pad char */
3534 0, /* escape_with_backslash_is_dangerous */
3535 1, /* levels_for_compare */
3536 1, /* levels_for_order */
3537 &my_charset_ucs2_handler,
3538 &my_collation_ucs2_bin_handler
3539 };
3540
3541
3542 #endif /* HAVE_CHARSET_ucs2 */
3543