1 /* Copyright (c) 2003, 2013, Oracle and/or its affiliates
2 Copyright (c) 2009, 2020, MariaDB
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Library General Public
6 License as published by the Free Software Foundation; version 2
7 of the License.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
13
14 You should have received a copy of the GNU Library General Public
15 License along with this library; if not, write to the Free
16 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
17 MA 02110-1335 USA */
18
19 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
20
21 #include "strings_def.h"
22 #include <m_ctype.h>
23 #include <my_sys.h>
24 #include <stdarg.h>
25
26 #include "ctype-unidata.h"
27
28
/* HAVE_CHARSET_mb2 is defined when UTF-16 or UCS-2 support is compiled in. */
#if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2)
#define HAVE_CHARSET_mb2
#endif


/* HAVE_CHARSET_mb2_or_mb4 additionally covers builds with UTF-32 support. */
#if defined(HAVE_CHARSET_mb2) || defined(HAVE_CHARSET_utf32)
#define HAVE_CHARSET_mb2_or_mb4
#endif

/* Substitute ENOENT when the platform does not define EILSEQ. */
#ifndef EILSEQ
#define EILSEQ ENOENT
#endif
41
/*
  Limits and powers of ten used by the string-to-integer converters below.
*/
#undef  ULONGLONG_MAX
#define ULONGLONG_MAX           (~(ulonglong) 0)
#define MAX_NEGATIVE_NUMBER     ((ulonglong) 0x8000000000000000LL)
#define INIT_CNT                9                 /* Digits read per chunk */
#define LFACTOR                 1000000000ULL     /* 10^9  */
#define LFACTOR1                10000000000ULL    /* 10^10 */
#define LFACTOR2                100000000000ULL   /* 10^11 */

#if defined(HAVE_CHARSET_utf32) || defined(HAVE_CHARSET_mb2)
/* lfactor[n] == 10^n for n = 0..8. Read-only lookup table. */
static const unsigned long lfactor[9]=
{ 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L };
#endif
54
55
56 #ifdef HAVE_CHARSET_mb2_or_mb4
57 static size_t
my_caseup_str_mb2_or_mb4(CHARSET_INFO * cs,char * s)58 my_caseup_str_mb2_or_mb4(CHARSET_INFO * cs __attribute__((unused)),
59 char * s __attribute__((unused)))
60 {
61 DBUG_ASSERT(0);
62 return 0;
63 }
64
65
66 static size_t
my_casedn_str_mb2_or_mb4(CHARSET_INFO * cs,char * s)67 my_casedn_str_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
68 char * s __attribute__((unused)))
69 {
70 DBUG_ASSERT(0);
71 return 0;
72 }
73
74
75 static int
my_strcasecmp_mb2_or_mb4(CHARSET_INFO * cs,const char * s,const char * t)76 my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
77 const char *s __attribute__((unused)),
78 const char *t __attribute__((unused)))
79 {
80 DBUG_ASSERT(0);
81 return 0;
82 }
83
84
/* Outcome of copying a single character. */
typedef enum
{
  /* The character was valid as it is */
  MY_CHAR_COPY_OK= 0,
  /* The character was not valid and could not be fixed */
  MY_CHAR_COPY_ERROR= 1,
  /* The character was not valid and was replaced with '?' */
  MY_CHAR_COPY_FIXED= 2
} my_char_copy_status_t;
91
92
93 /*
94 Copies an incomplete character, lef-padding it with 0x00 bytes.
95
96 @param cs Character set
97 @param dst The destination string
98 @param dst_length Space available in dst
99 @param src The source string
100 @param src_length Length of src
101 @param nchars Copy not more than nchars characters.
102 The "nchars" parameter of the caller.
103 Only 0 and non-0 are important here.
104 @param fix What to do if after zero-padding didn't get a valid
105 character:
106 - FALSE - exit with error.
107 - TRUE - try to put '?' instead.
108
109 @return MY_CHAR_COPY_OK if after zero-padding got a valid character.
110 cs->mbmaxlen bytes were written to "dst".
111 @return MY_CHAR_COPY_FIXED if after zero-padding did not get a valid
112 character, but wrote '?' to the destination
113 string instead.
114 cs->mbminlen bytes were written to "dst".
115 @return MY_CHAR_COPY_ERROR If failed and nothing was written to "dst".
116 Possible reasons:
117 - dst_length was too short
118 - nchars was 0
119 - the character after padding appeared not
120 to be valid, and could not fix it to '?'.
121 */
122 static my_char_copy_status_t
my_copy_incomplete_char(CHARSET_INFO * cs,char * dst,size_t dst_length,const char * src,size_t src_length,size_t nchars,my_bool fix)123 my_copy_incomplete_char(CHARSET_INFO *cs,
124 char *dst, size_t dst_length,
125 const char *src, size_t src_length,
126 size_t nchars, my_bool fix)
127 {
128 size_t pad_length;
129 size_t src_offset= src_length % cs->mbminlen;
130 if (dst_length < cs->mbminlen || !nchars)
131 return MY_CHAR_COPY_ERROR;
132
133 pad_length= cs->mbminlen - src_offset;
134 bzero(dst, pad_length);
135 memmove(dst + pad_length, src, src_offset);
136 /*
137 In some cases left zero-padding can create an incorrect character.
138 For example:
139 INSERT INTO t1 (utf32_column) VALUES (0x110000);
140 We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
141 The valid characters range is limited to 0x00000000..0x0010FFFF.
142
143 Make sure we didn't pad to an incorrect character.
144 */
145 if (my_ci_charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
146 (int) cs->mbminlen)
147 return MY_CHAR_COPY_OK;
148
149 if (fix &&
150 my_ci_wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
151 (int) cs->mbminlen)
152 return MY_CHAR_COPY_FIXED;
153
154 return MY_CHAR_COPY_ERROR;
155 }
156
157
158 /*
159 Copy an UCS2/UTF16/UTF32 string, fix bad characters.
160 */
161 static size_t
my_copy_fix_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t dst_length,const char * src,size_t src_length,size_t nchars,MY_STRCOPY_STATUS * status)162 my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs,
163 char *dst, size_t dst_length,
164 const char *src, size_t src_length,
165 size_t nchars, MY_STRCOPY_STATUS *status)
166 {
167 size_t length2, src_offset= src_length % cs->mbminlen;
168 my_char_copy_status_t padstatus;
169
170 if (!src_offset)
171 return my_copy_fix_mb(cs, dst, dst_length,
172 src, src_length, nchars, status);
173 if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length,
174 src, src_length, nchars, TRUE)) ==
175 MY_CHAR_COPY_ERROR)
176 {
177 status->m_source_end_pos= status->m_well_formed_error_pos= src;
178 return 0;
179 }
180 length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen,
181 src + src_offset, src_length - src_offset,
182 nchars - 1, status);
183 if (padstatus == MY_CHAR_COPY_FIXED)
184 status->m_well_formed_error_pos= src;
185 return cs->mbminlen /* The left-padded character */ + length2;
186 }
187
188
189 static long
my_strntol_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)190 my_strntol_mb2_or_mb4(CHARSET_INFO *cs,
191 const char *nptr, size_t l, int base,
192 char **endptr, int *err)
193 {
194 int negative= 0;
195 int overflow;
196 int cnv;
197 my_wc_t wc;
198 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
199 register unsigned int cutlim;
200 register uint32 cutoff;
201 register uint32 res;
202 register const uchar *s= (const uchar*) nptr;
203 register const uchar *e= (const uchar*) nptr+l;
204 const uchar *save;
205
206 *err= 0;
207 do
208 {
209 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
210 {
211 switch (wc)
212 {
213 case ' ' : break;
214 case '\t': break;
215 case '-' : negative= !negative; break;
216 case '+' : break;
217 default : goto bs;
218 }
219 }
220 else /* No more characters or bad multibyte sequence */
221 {
222 if (endptr != NULL )
223 *endptr= (char*) s;
224 err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
225 return 0;
226 }
227 s+= cnv;
228 } while (1);
229
230 bs:
231
232 overflow= 0;
233 res= 0;
234 save= s;
235 cutoff= ((uint32)~0L) / (uint32) base;
236 cutlim= (uint) (((uint32)~0L) % (uint32) base);
237
238 do {
239 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
240 {
241 s+= cnv;
242 if (wc >= '0' && wc <= '9')
243 wc-= '0';
244 else if (wc >= 'A' && wc <= 'Z')
245 wc= wc - 'A' + 10;
246 else if (wc >= 'a' && wc <= 'z')
247 wc= wc - 'a' + 10;
248 else
249 break;
250 if ((int)wc >= base)
251 break;
252 if (res > cutoff || (res == cutoff && wc > cutlim))
253 overflow= 1;
254 else
255 {
256 res*= (uint32) base;
257 res+= wc;
258 }
259 }
260 else if (cnv == MY_CS_ILSEQ)
261 {
262 if (endptr !=NULL )
263 *endptr = (char*) s;
264 err[0]= EILSEQ;
265 return 0;
266 }
267 else
268 {
269 /* No more characters */
270 break;
271 }
272 } while(1);
273
274 if (endptr != NULL)
275 *endptr = (char *) s;
276
277 if (s == save)
278 {
279 err[0]= EDOM;
280 return 0L;
281 }
282
283 if (negative)
284 {
285 if (res > (uint32) INT_MIN32)
286 overflow= 1;
287 }
288 else if (res > INT_MAX32)
289 overflow= 1;
290
291 if (overflow)
292 {
293 err[0]= ERANGE;
294 return negative ? INT_MIN32 : INT_MAX32;
295 }
296
297 return (negative ? -((long) res) : (long) res);
298 }
299
300
301 static ulong
my_strntoul_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)302 my_strntoul_mb2_or_mb4(CHARSET_INFO *cs,
303 const char *nptr, size_t l, int base,
304 char **endptr, int *err)
305 {
306 int negative= 0;
307 int overflow;
308 int cnv;
309 my_wc_t wc;
310 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
311 register unsigned int cutlim;
312 register uint32 cutoff;
313 register uint32 res;
314 register const uchar *s= (const uchar*) nptr;
315 register const uchar *e= (const uchar*) nptr + l;
316 const uchar *save;
317
318 *err= 0;
319 do
320 {
321 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
322 {
323 switch (wc)
324 {
325 case ' ' : break;
326 case '\t': break;
327 case '-' : negative= !negative; break;
328 case '+' : break;
329 default : goto bs;
330 }
331 }
332 else /* No more characters or bad multibyte sequence */
333 {
334 if (endptr !=NULL )
335 *endptr= (char*)s;
336 err[0]= (cnv == MY_CS_ILSEQ) ? EILSEQ : EDOM;
337 return 0;
338 }
339 s+= cnv;
340 } while (1);
341
342 bs:
343
344 overflow= 0;
345 res= 0;
346 save= s;
347 cutoff= ((uint32)~0L) / (uint32) base;
348 cutlim= (uint) (((uint32)~0L) % (uint32) base);
349
350 do
351 {
352 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
353 {
354 s+= cnv;
355 if (wc >= '0' && wc <= '9')
356 wc-= '0';
357 else if (wc >= 'A' && wc <= 'Z')
358 wc= wc - 'A' + 10;
359 else if (wc >= 'a' && wc <= 'z')
360 wc= wc - 'a' + 10;
361 else
362 break;
363 if ((int) wc >= base)
364 break;
365 if (res > cutoff || (res == cutoff && wc > cutlim))
366 overflow = 1;
367 else
368 {
369 res*= (uint32) base;
370 res+= wc;
371 }
372 }
373 else if (cnv == MY_CS_ILSEQ)
374 {
375 if (endptr != NULL )
376 *endptr= (char*)s;
377 err[0]= EILSEQ;
378 return 0;
379 }
380 else
381 {
382 /* No more characters */
383 break;
384 }
385 } while(1);
386
387 if (endptr != NULL)
388 *endptr= (char *) s;
389
390 if (s == save)
391 {
392 err[0]= EDOM;
393 return 0L;
394 }
395
396 if (overflow)
397 {
398 err[0]= (ERANGE);
399 return (~(uint32) 0);
400 }
401
402 return (negative ? -((long) res) : (long) res);
403 }
404
405
406 static longlong
my_strntoll_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)407 my_strntoll_mb2_or_mb4(CHARSET_INFO *cs,
408 const char *nptr, size_t l, int base,
409 char **endptr, int *err)
410 {
411 int negative=0;
412 int overflow;
413 int cnv;
414 my_wc_t wc;
415 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
416 register ulonglong cutoff;
417 register unsigned int cutlim;
418 register ulonglong res;
419 register const uchar *s= (const uchar*) nptr;
420 register const uchar *e= (const uchar*) nptr+l;
421 const uchar *save;
422
423 *err= 0;
424 do
425 {
426 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
427 {
428 switch (wc)
429 {
430 case ' ' : break;
431 case '\t': break;
432 case '-' : negative= !negative; break;
433 case '+' : break;
434 default : goto bs;
435 }
436 }
437 else /* No more characters or bad multibyte sequence */
438 {
439 if (endptr !=NULL )
440 *endptr = (char*)s;
441 err[0] = (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
442 return 0;
443 }
444 s+=cnv;
445 } while (1);
446
447 bs:
448
449 overflow = 0;
450 res = 0;
451 save = s;
452 cutoff = (~(ulonglong) 0) / (unsigned long int) base;
453 cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
454
455 do {
456 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
457 {
458 s+=cnv;
459 if ( wc>='0' && wc<='9')
460 wc -= '0';
461 else if ( wc>='A' && wc<='Z')
462 wc = wc - 'A' + 10;
463 else if ( wc>='a' && wc<='z')
464 wc = wc - 'a' + 10;
465 else
466 break;
467 if ((int)wc >= base)
468 break;
469 if (res > cutoff || (res == cutoff && wc > cutlim))
470 overflow = 1;
471 else
472 {
473 res *= (ulonglong) base;
474 res += wc;
475 }
476 }
477 else if (cnv==MY_CS_ILSEQ)
478 {
479 if (endptr !=NULL )
480 *endptr = (char*)s;
481 err[0]=EILSEQ;
482 return 0;
483 }
484 else
485 {
486 /* No more characters */
487 break;
488 }
489 } while(1);
490
491 if (endptr != NULL)
492 *endptr = (char *) s;
493
494 if (s == save)
495 {
496 err[0]=EDOM;
497 return 0L;
498 }
499
500 if (negative)
501 {
502 if (res > (ulonglong) LONGLONG_MIN)
503 overflow = 1;
504 }
505 else if (res > (ulonglong) LONGLONG_MAX)
506 overflow = 1;
507
508 if (overflow)
509 {
510 err[0]=ERANGE;
511 return negative ? LONGLONG_MIN : LONGLONG_MAX;
512 }
513
514 return (negative ? -((longlong)res) : (longlong)res);
515 }
516
517
518 static ulonglong
my_strntoull_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)519 my_strntoull_mb2_or_mb4(CHARSET_INFO *cs,
520 const char *nptr, size_t l, int base,
521 char **endptr, int *err)
522 {
523 int negative= 0;
524 int overflow;
525 int cnv;
526 my_wc_t wc;
527 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
528 register ulonglong cutoff;
529 register unsigned int cutlim;
530 register ulonglong res;
531 register const uchar *s= (const uchar*) nptr;
532 register const uchar *e= (const uchar*) nptr + l;
533 const uchar *save;
534
535 *err= 0;
536 do
537 {
538 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
539 {
540 switch (wc)
541 {
542 case ' ' : break;
543 case '\t': break;
544 case '-' : negative= !negative; break;
545 case '+' : break;
546 default : goto bs;
547 }
548 }
549 else /* No more characters or bad multibyte sequence */
550 {
551 if (endptr !=NULL )
552 *endptr = (char*)s;
553 err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
554 return 0;
555 }
556 s+=cnv;
557 } while (1);
558
559 bs:
560
561 overflow = 0;
562 res = 0;
563 save = s;
564 cutoff = (~(ulonglong) 0) / (unsigned long int) base;
565 cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
566
567 do
568 {
569 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
570 {
571 s+=cnv;
572 if ( wc>='0' && wc<='9')
573 wc -= '0';
574 else if ( wc>='A' && wc<='Z')
575 wc = wc - 'A' + 10;
576 else if ( wc>='a' && wc<='z')
577 wc = wc - 'a' + 10;
578 else
579 break;
580 if ((int)wc >= base)
581 break;
582 if (res > cutoff || (res == cutoff && wc > cutlim))
583 overflow = 1;
584 else
585 {
586 res *= (ulonglong) base;
587 res += wc;
588 }
589 }
590 else if (cnv==MY_CS_ILSEQ)
591 {
592 if (endptr !=NULL )
593 *endptr = (char*)s;
594 err[0]= EILSEQ;
595 return 0;
596 }
597 else
598 {
599 /* No more characters */
600 break;
601 }
602 } while(1);
603
604 if (endptr != NULL)
605 *endptr = (char *) s;
606
607 if (s == save)
608 {
609 err[0]= EDOM;
610 return 0L;
611 }
612
613 if (overflow)
614 {
615 err[0]= ERANGE;
616 return (~(ulonglong) 0);
617 }
618
619 return (negative ? -((longlong) res) : (longlong) res);
620 }
621
622
623 static double
my_strntod_mb2_or_mb4(CHARSET_INFO * cs,char * nptr,size_t length,char ** endptr,int * err)624 my_strntod_mb2_or_mb4(CHARSET_INFO *cs,
625 char *nptr, size_t length,
626 char **endptr, int *err)
627 {
628 char buf[256];
629 double res;
630 register char *b= buf;
631 register const uchar *s= (const uchar*) nptr;
632 const uchar *end;
633 my_wc_t wc;
634 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
635 int cnv;
636
637 *err= 0;
638 /* Cut too long strings */
639 if (length >= sizeof(buf))
640 length= sizeof(buf) - 1;
641 end= s + length;
642
643 while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
644 {
645 s+= cnv;
646 if (wc > (int) (uchar) 'e' || !wc)
647 break; /* Can't be part of double */
648 *b++= (char) wc;
649 }
650
651 *endptr= b;
652 res= my_strtod(buf, endptr, err);
653 *endptr= nptr + cs->mbminlen * (size_t) (*endptr - buf);
654 return res;
655 }
656
657
658 static ulonglong
my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t length,int unsign_fl,char ** endptr,int * err)659 my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs,
660 const char *nptr, size_t length,
661 int unsign_fl,
662 char **endptr, int *err)
663 {
664 char buf[256], *b= buf;
665 ulonglong res;
666 const uchar *end, *s= (const uchar*) nptr;
667 my_wc_t wc;
668 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
669 int cnv;
670
671 /* Cut too long strings */
672 if (length >= sizeof(buf))
673 length= sizeof(buf)-1;
674 end= s + length;
675
676 while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
677 {
678 s+= cnv;
679 if (wc > (int) (uchar) 'e' || !wc)
680 break; /* Can't be a number part */
681 *b++= (char) wc;
682 }
683
684 res= my_strntoull10rnd_8bit(cs, buf, b - buf, unsign_fl, endptr, err);
685 *endptr= (char*) nptr + cs->mbminlen * (size_t) (*endptr - buf);
686 return res;
687 }
688
689
690 /*
691 This is a fast version optimized for the case of radix 10 / -10
692 */
693
694 static size_t
my_l10tostr_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t len,int radix,long int val)695 my_l10tostr_mb2_or_mb4(CHARSET_INFO *cs,
696 char *dst, size_t len, int radix, long int val)
697 {
698 char buffer[66];
699 register char *p, *db, *de;
700 long int new_val;
701 int sl= 0;
702 unsigned long int uval = (unsigned long int) val;
703
704 p= &buffer[sizeof(buffer) - 1];
705 *p= '\0';
706
707 if (radix < 0)
708 {
709 if (val < 0)
710 {
711 sl= 1;
712 /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
713 uval = (unsigned long int)0 - uval;
714 }
715 }
716
717 new_val = (long) (uval / 10);
718 *--p = '0'+ (char) (uval - (unsigned long) new_val * 10);
719 val= new_val;
720
721 while (val != 0)
722 {
723 new_val= val / 10;
724 *--p= '0' + (char) (val - new_val * 10);
725 val= new_val;
726 }
727
728 if (sl)
729 {
730 *--p= '-';
731 }
732
733 for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
734 {
735 int cnvres= my_ci_wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
736 if (cnvres > 0)
737 dst+= cnvres;
738 else
739 break;
740 }
741 return (int) (dst - db);
742 }
743
744
745 static size_t
my_ll10tostr_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t len,int radix,longlong val)746 my_ll10tostr_mb2_or_mb4(CHARSET_INFO *cs,
747 char *dst, size_t len, int radix, longlong val)
748 {
749 char buffer[65];
750 register char *p, *db, *de;
751 long long_val;
752 int sl= 0;
753 ulonglong uval= (ulonglong) val;
754
755 if (radix < 0)
756 {
757 if (val < 0)
758 {
759 sl= 1;
760 /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
761 uval = (ulonglong)0 - uval;
762 }
763 }
764
765 p= &buffer[sizeof(buffer)-1];
766 *p='\0';
767
768 if (uval == 0)
769 {
770 *--p= '0';
771 goto cnv;
772 }
773
774 while (uval > (ulonglong) LONG_MAX)
775 {
776 ulonglong quo= uval/(uint) 10;
777 uint rem= (uint) (uval- quo* (uint) 10);
778 *--p= '0' + rem;
779 uval= quo;
780 }
781
782 long_val= (long) uval;
783 while (long_val != 0)
784 {
785 long quo= long_val/10;
786 *--p= (char) ('0' + (long_val - quo*10));
787 long_val= quo;
788 }
789
790 cnv:
791 if (sl)
792 {
793 *--p= '-';
794 }
795
796 for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
797 {
798 int cnvres= my_ci_wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
799 if (cnvres > 0)
800 dst+= cnvres;
801 else
802 break;
803 }
804 return (int) (dst -db);
805 }
806
807 #endif /* HAVE_CHARSET_mb2_or_mb4 */
808
809
810 #ifdef HAVE_CHARSET_mb2
811 /**
812 Convert a Unicode code point to a digit.
813 @param wc - the input Unicode code point
814 @param[OUT] c - the output character representing the digit value 0..9
815
816 @return 0 - if wc is a good digit
817 @return 1 - if wc is not a digit
818 */
819 static inline my_bool
wc2digit_uchar(uchar * c,my_wc_t wc)820 wc2digit_uchar(uchar *c, my_wc_t wc)
821 {
822 return wc > '9' || (c[0]= (uchar) (wc - '0')) > 9;
823 }
824
825
826 static longlong
my_strtoll10_mb2(CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)827 my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
828 const char *nptr, char **endptr, int *error)
829 {
830 const uchar *s, *end, *start, *n_end, *true_end;
831 uchar UNINIT_VAR(c);
832 unsigned long i, j, k;
833 ulonglong li;
834 int negative;
835 ulong cutoff, cutoff2, cutoff3;
836 my_wc_t wc;
837 int res;
838 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
839
840 s= (const uchar *) nptr;
841 /* If fixed length string */
842 if (endptr)
843 {
844 /*
845 Make sure string length is even.
846 Odd length indicates a bug in the caller.
847 Assert in debug, round in production.
848 */
849 DBUG_ASSERT((*endptr - (const char *) s) % 2 == 0);
850 end= s + ((*endptr - (const char*) s) / 2) * 2;
851
852 for ( ; ; ) /* Skip leading spaces and tabs */
853 {
854 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
855 goto no_conv;
856 s+= res;
857 if (wc != ' ' && wc != '\t')
858 break;
859 }
860 }
861 else
862 {
863 /* We don't support null terminated strings in UCS2 */
864 goto no_conv;
865 }
866
867 /* Check for a sign. */
868 negative= 0;
869 if (wc == '-')
870 {
871 *error= -1; /* Mark as negative number */
872 negative= 1;
873 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
874 goto no_conv;
875 s+= res; /* wc is now expected to hold the first digit. */
876 cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2;
877 cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
878 cutoff3= MAX_NEGATIVE_NUMBER % 100;
879 }
880 else
881 {
882 *error= 0;
883 if (wc == '+')
884 {
885 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
886 goto no_conv;
887 s+= res; /* wc is now expected to hold the first digit. */
888 }
889 cutoff= ULONGLONG_MAX / LFACTOR2;
890 cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
891 cutoff3= ULONGLONG_MAX % 100;
892 }
893
894 /*
895 The code below assumes that 'wc' holds the first digit
896 and 's' points to the next character after it.
897
898 Scan pre-zeros if any.
899 */
900 if (wc == '0')
901 {
902 i= 0;
903 for ( ; ; s+= res)
904 {
905 if (s == end)
906 goto end_i; /* Return 0 */
907 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
908 goto no_conv;
909 if (wc != '0')
910 break;
911 }
912 n_end= s + 2 * INIT_CNT;
913 }
914 else
915 {
916 /* Read first digit to check that it's a valid number */
917 if ((i= (wc - '0')) > 9)
918 goto no_conv;
919 n_end= s + 2 * (INIT_CNT-1);
920 }
921
922 /* Handle first 9 digits and store them in i */
923 if (n_end > end)
924 n_end= end;
925 for ( ; ; s+= res)
926 {
927 if ((res= mb_wc(cs, &wc, s, n_end)) <= 0)
928 break;
929 if (wc2digit_uchar(&c, wc))
930 goto end_i;
931 i= i*10+c;
932 }
933 if (s == end)
934 goto end_i;
935
936 /* Handle next 9 digits and store them in j */
937 j= 0;
938 start= s; /* Used to know how much to shift i */
939 n_end= true_end= s + 2 * INIT_CNT;
940 if (n_end > end)
941 n_end= end;
942 do
943 {
944 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
945 goto no_conv;
946 if (wc2digit_uchar(&c, wc))
947 goto end_i_and_j;
948 s+= res;
949 j= j * 10 + c;
950 } while (s != n_end);
951 if (s == end)
952 {
953 if (s != true_end)
954 goto end_i_and_j;
955 goto end3;
956 }
957
958 /* Handle the next 1 or 2 digits and store them in k */
959 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
960 goto no_conv;
961 if ((k= (wc - '0')) > 9)
962 goto end3;
963 s+= res;
964
965 if (s == end)
966 goto end4;
967 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
968 goto no_conv;
969 if (wc2digit_uchar(&c, wc))
970 goto end4;
971 s+= res;
972 k= k*10+c;
973 *endptr= (char*) s;
974
975 /* number string should have ended here */
976 if (s != end && mb_wc(cs, &wc, s, end) > 0 && ((uchar) (wc - '0')) <= 9)
977 goto overflow;
978
979 /* Check that we didn't get an overflow with the last digit */
980 if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
981 k > cutoff3)))
982 goto overflow;
983 li=i*LFACTOR2+ (ulonglong) j*100 + k;
984 return (longlong) li;
985
986 overflow: /* *endptr is set here */
987 *error= MY_ERRNO_ERANGE;
988 return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
989
990 end_i:
991 *endptr= (char*) s;
992 return (negative ? ((longlong) -(long) i) : (longlong) i);
993
994 end_i_and_j:
995 li= (ulonglong) i * lfactor[(size_t) (s-start) / 2] + j;
996 *endptr= (char*) s;
997 return (negative ? -((longlong) li) : (longlong) li);
998
999 end3:
1000 li=(ulonglong) i*LFACTOR+ (ulonglong) j;
1001 *endptr= (char*) s;
1002 return (negative ? -((longlong) li) : (longlong) li);
1003
1004 end4:
1005 li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
1006 *endptr= (char*) s;
1007 if (negative)
1008 {
1009 if (li > MAX_NEGATIVE_NUMBER)
1010 goto overflow;
1011 return -((longlong) li);
1012 }
1013 return (longlong) li;
1014
1015 no_conv:
1016 /* There was no number to convert. */
1017 *error= MY_ERRNO_EDOM;
1018 *endptr= (char *) nptr;
1019 return 0;
1020 }
1021
1022
1023 static size_t
my_scan_mb2(CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)1024 my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)),
1025 const char *str, const char *end, int sequence_type)
1026 {
1027 const char *str0= str;
1028 my_wc_t wc;
1029 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1030 int res;
1031
1032 switch (sequence_type)
1033 {
1034 case MY_SEQ_SPACES:
1035 for (res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
1036 res > 0 && wc == ' ';
1037 str+= res,
1038 res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end))
1039 {
1040 }
1041 return (size_t) (str - str0);
1042 case MY_SEQ_NONSPACES:
1043 DBUG_ASSERT(0); /* Not implemented */
1044 /* pass through */
1045 default:
1046 return 0;
1047 }
1048 }
1049
1050
1051 static void
my_fill_mb2(CHARSET_INFO * cs,char * s,size_t slen,int fill)1052 my_fill_mb2(CHARSET_INFO *cs, char *s, size_t slen, int fill)
1053 {
1054 char buf[10], *last;
1055 size_t buflen, remainder;
1056
1057 DBUG_ASSERT((slen % 2) == 0);
1058
1059 buflen= my_ci_wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
1060 (uchar*) buf + sizeof(buf));
1061
1062 DBUG_ASSERT(buflen > 0);
1063
1064 /*
1065 "last" in the last position where a sequence of "buflen" bytes can start.
1066 */
1067 for (last= s + slen - buflen; s <= last; s+= buflen)
1068 {
1069 /* Enough space for the character */
1070 memcpy(s, buf, buflen);
1071 }
1072
1073 /*
1074 If there are some more space which is not enough
1075 for the whole multibyte character, then add trailing zeros.
1076 */
1077 if ((remainder= last + buflen - s) > 0)
1078 bzero(s, (size_t) remainder);
1079 }
1080
1081
1082 static size_t
my_vsnprintf_mb2(char * dst,size_t n,const char * fmt,va_list ap)1083 my_vsnprintf_mb2(char *dst, size_t n, const char* fmt, va_list ap)
1084 {
1085 char *start=dst, *end= dst + n - 1;
1086 for (; *fmt ; fmt++)
1087 {
1088 if (fmt[0] != '%')
1089 {
1090 if (dst == end) /* End of buffer */
1091 break;
1092
1093 *dst++='\0';
1094 *dst++= *fmt; /* Copy ordinary char */
1095 continue;
1096 }
1097
1098 fmt++;
1099
1100 /* Skip if max size is used (to be compatible with printf) */
1101 while ( (*fmt >= '0' && *fmt <= '9') || *fmt == '.' || *fmt == '-')
1102 fmt++;
1103
1104 if (*fmt == 'l')
1105 fmt++;
1106
1107 if (*fmt == 's') /* String parameter */
1108 {
1109 char *par= va_arg(ap, char *);
1110 size_t plen;
1111 size_t left_len= (size_t)(end-dst);
1112 if (!par)
1113 par= (char*) "(null)";
1114 plen= strlen(par);
1115 if (left_len <= plen * 2)
1116 plen = left_len / 2 - 1;
1117
1118 for ( ; plen ; plen--, dst+=2, par++)
1119 {
1120 dst[0]= '\0';
1121 dst[1]= par[0];
1122 }
1123 continue;
1124 }
1125 else if (*fmt == 'd' || *fmt == 'u') /* Integer parameter */
1126 {
1127 int iarg;
1128 char nbuf[16];
1129 char *pbuf= nbuf;
1130
1131 if ((size_t) (end - dst) < 32)
1132 break;
1133 iarg= va_arg(ap, int);
1134 if (*fmt == 'd')
1135 int10_to_str((long) iarg, nbuf, -10);
1136 else
1137 int10_to_str((long) (uint) iarg, nbuf,10);
1138
1139 for (; pbuf[0]; pbuf++)
1140 {
1141 *dst++= '\0';
1142 *dst++= *pbuf;
1143 }
1144 continue;
1145 }
1146
1147 /* We come here on '%%', unknown code or too long parameter */
1148 if (dst == end)
1149 break;
1150 *dst++= '\0';
1151 *dst++= '%'; /* % used as % or unknown code */
1152 }
1153
1154 DBUG_ASSERT(dst <= end);
1155 *dst='\0'; /* End of errmessage */
1156 return (size_t) (dst - start);
1157 }
1158
1159
1160 static size_t
my_snprintf_mb2(CHARSET_INFO * cs,char * to,size_t n,const char * fmt,...)1161 my_snprintf_mb2(CHARSET_INFO *cs __attribute__((unused)),
1162 char* to, size_t n, const char* fmt, ...)
1163 {
1164 size_t ret;
1165 va_list args;
1166 va_start(args,fmt);
1167 ret= my_vsnprintf_mb2(to, n, fmt, args);
1168 va_end(args);
1169 return ret;
1170 }
1171
1172
1173 static size_t
my_lengthsp_mb2(CHARSET_INFO * cs,const char * ptr,size_t length)1174 my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
1175 const char *ptr, size_t length)
1176 {
1177 const char *end= ptr + length;
1178 while (end > ptr + 1 && end[-1] == ' ' && end[-2] == '\0')
1179 end-= 2;
1180 return (size_t) (end - ptr);
1181 }
1182
1183 #endif /* HAVE_CHARSET_mb2*/
1184
1185
1186 /*
1187 Next part is actually HAVE_CHARSET_utf16-specific,
1188 but the JSON functions needed my_utf16_uni()
1189 so the #ifdef was moved lower.
1190 */
1191 #include "ctype-utf16.h"
1192
/*
  Character classification by leading bytes, defined before the
  "strcoll.inl" inclusions below and #undef'ed after them.
  A 2-byte character is anything whose first byte is not a surrogate
  head (b1 is not examined); a 4-byte character needs a high surrogate
  head in b0 and a low surrogate head in b2.
*/
#define IS_MB2_CHAR(b0,b1)       (!MY_UTF16_SURROGATE_HEAD(b0))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
1195
/*
  Weight of a 2-byte UTF-16 character for the general_ci collations:
  the "sort" value from my_unicase_default_pages, or the code point
  itself when its page is not present.
*/
static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
{
  my_wc_t wc= MY_UTF16_WC2(b0, b1);
  MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];

  if (!page)
    return (int) wc;
  return (int) page[wc & 0xFF].sort;
}
/*
  Each group below sets parameter macros and then includes "strcoll.inl".
  MY_FUNCTION_NAME() gives the name pattern for that group
  (my_<x>_utf16_general_ci, my_<x>_utf16_bin, ...).
  NOTE(review): the macros are defined again in every group without an
  #undef here, so "strcoll.inl" presumably undefines them itself - confirm.
*/

/* utf16_general_ci: weights from the case-folding table */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_utf16_quick(pwc, s, e)
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b0,b1)
/* All 4-byte characters share the replacement character weight */
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16_bin: the weight of a character is its code point */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b0, b1))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* utf16_general_nopad_ci: general_ci weights, DEFINE_STRNNCOLLSP_NOPAD set */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b0,b1)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16_nopad_bin: code point weights, DEFINE_STRNNCOLLSP_NOPAD set */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b0, b1))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* The classification macros are not needed past this point */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
1237
1238 /*
1239 These two functions are used in JSON library, so made exportable
1240 and unconditionally compiled into the library.
1241 */
1242
1243 /*static*/ int
my_utf16_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)1244 my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
1245 my_wc_t *pwc, const uchar *s, const uchar *e)
1246 {
1247 return my_mb_wc_utf16_quick(pwc, s, e);
1248 }
1249
1250
1251 /*static*/ int
my_uni_utf16(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)1252 my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
1253 my_wc_t wc, uchar *s, uchar *e)
1254 {
1255 if (wc <= 0xFFFF)
1256 {
1257 if (s + 2 > e)
1258 return MY_CS_TOOSMALL2;
1259 if (MY_UTF16_SURROGATE(wc))
1260 return MY_CS_ILUNI;
1261 *s++= (uchar) (wc >> 8);
1262 *s= (uchar) (wc & 0xFF);
1263 return 2;
1264 }
1265
1266 if (wc <= 0x10FFFF)
1267 {
1268 if (s + 4 > e)
1269 return MY_CS_TOOSMALL4;
1270 *s++= (uchar) ((wc-= 0x10000) >> 18) | 0xD8;
1271 *s++= (uchar) (wc >> 10) & 0xFF;
1272 *s++= (uchar) ((wc >> 8) & 3) | 0xDC;
1273 *s= (uchar) wc & 0xFF;
1274 return 4;
1275 }
1276
1277 return MY_CS_ILUNI;
1278 }
1279
1280
1281 #ifdef HAVE_CHARSET_utf16
1282
/* Character set name stored in the "cs name" field of the utf16le descriptors */
const char charset_name_utf16le[] = "utf16le";
1284
1285 static inline void
my_tolower_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1286 my_tolower_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1287 {
1288 MY_UNICASE_CHARACTER *page;
1289 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1290 *wc= page[*wc & 0xFF].tolower;
1291 }
1292
1293
1294 static inline void
my_toupper_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1295 my_toupper_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1296 {
1297 MY_UNICASE_CHARACTER *page;
1298 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1299 *wc= page[*wc & 0xFF].toupper;
1300 }
1301
1302
1303 static inline void
my_tosort_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1304 my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1305 {
1306 if (*wc <= uni_plane->maxchar)
1307 {
1308 MY_UNICASE_CHARACTER *page;
1309 if ((page= uni_plane->page[*wc >> 8]))
1310 *wc= page[*wc & 0xFF].sort;
1311 }
1312 else
1313 {
1314 *wc= MY_CS_REPLACEMENT_CHARACTER;
1315 }
1316 }
1317
1318
1319
1320 static size_t
my_caseup_utf16(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)1321 my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
1322 char *dst, size_t dstlen)
1323 {
1324 my_wc_t wc;
1325 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1326 my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
1327 int res;
1328 const char *srcend= src + srclen;
1329 char *dstend= dst + dstlen;
1330 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1331 DBUG_ASSERT(srclen <= dstlen);
1332
1333 while ((src < srcend) &&
1334 (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1335 {
1336 my_toupper_utf16(uni_plane, &wc);
1337 if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
1338 break;
1339 src+= res;
1340 dst+= res;
1341 }
1342 return srclen;
1343 }
1344
1345
1346 static void
my_hash_sort_utf16_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)1347 my_hash_sort_utf16_nopad(CHARSET_INFO *cs,
1348 const uchar *s, size_t slen,
1349 ulong *nr1, ulong *nr2)
1350 {
1351 my_wc_t wc;
1352 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1353 int res;
1354 const uchar *e= s + slen;
1355 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1356 register ulong m1= *nr1, m2= *nr2;
1357
1358 while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
1359 {
1360 my_tosort_utf16(uni_plane, &wc);
1361 MY_HASH_ADD_16(m1, m2, wc);
1362 s+= res;
1363 }
1364 *nr1= m1;
1365 *nr2= m2;
1366 }
1367
1368
1369 static void
my_hash_sort_utf16(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)1370 my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen,
1371 ulong *nr1, ulong *nr2)
1372 {
1373 size_t lengthsp= my_ci_lengthsp(cs, (const char *) s, slen);
1374 my_hash_sort_utf16_nopad(cs, s, lengthsp, nr1, nr2);
1375 }
1376
1377
1378 static size_t
my_casedn_utf16(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)1379 my_casedn_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
1380 char *dst, size_t dstlen)
1381 {
1382 my_wc_t wc;
1383 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1384 my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
1385 int res;
1386 const char *srcend= src + srclen;
1387 char *dstend= dst + dstlen;
1388 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1389 DBUG_ASSERT(srclen <= dstlen);
1390
1391 while ((src < srcend) &&
1392 (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1393 {
1394 my_tolower_utf16(uni_plane, &wc);
1395 if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
1396 break;
1397 src+= res;
1398 dst+= res;
1399 }
1400 return srclen;
1401 }
1402
1403
1404 static int
my_charlen_utf16(CHARSET_INFO * cs,const uchar * str,const uchar * end)1405 my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
1406 {
1407 my_wc_t wc;
1408 return my_ci_mb_wc(cs, &wc, str, end);
1409 }
1410
1411
1412 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16
1413 #define CHARLEN(cs,str,end) my_charlen_utf16(cs,str,end)
1414 #define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
1415 #include "ctype-mb.inl"
1416 #undef MY_FUNCTION_NAME
1417 #undef CHARLEN
1418 #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
1419 /* Defines my_well_formed_char_length_utf16 */
1420
1421
1422 static size_t
my_numchars_utf16(CHARSET_INFO * cs,const char * b,const char * e)1423 my_numchars_utf16(CHARSET_INFO *cs,
1424 const char *b, const char *e)
1425 {
1426 size_t nchars= 0;
1427 for ( ; ; nchars++)
1428 {
1429 size_t charlen= my_ismbchar(cs, b, e);
1430 if (!charlen)
1431 break;
1432 b+= charlen;
1433 }
1434 return nchars;
1435 }
1436
1437
1438 static size_t
my_charpos_utf16(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)1439 my_charpos_utf16(CHARSET_INFO *cs,
1440 const char *b, const char *e, size_t pos)
1441 {
1442 const char *b0= b;
1443 uint charlen;
1444
1445 for ( ; pos; b+= charlen, pos--)
1446 {
1447 if (!(charlen= my_ismbchar(cs, b, e)))
1448 return (e + 2 - b0); /* Error, return pos outside the string */
1449 }
1450 return (size_t) (pos ? (e + 2 - b0) : (b - b0));
1451 }
1452
1453
1454 static int
my_wildcmp_utf16_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1455 my_wildcmp_utf16_ci(CHARSET_INFO *cs,
1456 const char *str,const char *str_end,
1457 const char *wildstr,const char *wildend,
1458 int escape, int w_one, int w_many)
1459 {
1460 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1461 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1462 escape, w_one, w_many, uni_plane);
1463 }
1464
1465
1466 static int
my_wildcmp_utf16_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1467 my_wildcmp_utf16_bin(CHARSET_INFO *cs,
1468 const char *str,const char *str_end,
1469 const char *wildstr,const char *wildend,
1470 int escape, int w_one, int w_many)
1471 {
1472 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1473 escape, w_one, w_many, NULL);
1474 }
1475
1476
1477 static void
my_hash_sort_utf16_nopad_bin(CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1478 my_hash_sort_utf16_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
1479 const uchar *pos, size_t len,
1480 ulong *nr1, ulong *nr2)
1481 {
1482 const uchar *end= pos + len;
1483 register ulong m1= *nr1, m2= *nr2;
1484
1485 for ( ; pos < end ; pos++)
1486 {
1487 MY_HASH_ADD(m1, m2, (uint)*pos);
1488 }
1489 *nr1= m1;
1490 *nr2= m2;
1491 }
1492
1493
1494 static void
my_hash_sort_utf16_bin(CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1495 my_hash_sort_utf16_bin(CHARSET_INFO *cs,
1496 const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
1497 {
1498 size_t lengthsp= my_ci_lengthsp(cs, (const char *) pos, len);
1499 my_hash_sort_utf16_nopad_bin(cs, pos, lengthsp, nr1, nr2);
1500 }
1501
1502
1503 static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
1504 {
1505 NULL, /* init */
1506 my_strnncoll_utf16_general_ci,
1507 my_strnncollsp_utf16_general_ci,
1508 my_strnncollsp_nchars_utf16_general_ci,
1509 my_strnxfrm_utf16_general_ci,
1510 my_strnxfrmlen_unicode,
1511 my_like_range_generic,
1512 my_wildcmp_utf16_ci,
1513 my_strcasecmp_mb2_or_mb4,
1514 my_instr_mb,
1515 my_hash_sort_utf16,
1516 my_propagate_simple
1517 };
1518
1519
1520 static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
1521 {
1522 NULL, /* init */
1523 my_strnncoll_utf16_bin,
1524 my_strnncollsp_utf16_bin,
1525 my_strnncollsp_nchars_utf16_bin,
1526 my_strnxfrm_unicode_full_bin,
1527 my_strnxfrmlen_unicode_full_bin,
1528 my_like_range_generic,
1529 my_wildcmp_utf16_bin,
1530 my_strcasecmp_mb2_or_mb4,
1531 my_instr_mb,
1532 my_hash_sort_utf16_bin,
1533 my_propagate_simple
1534 };
1535
1536
1537 static MY_COLLATION_HANDLER my_collation_utf16_general_nopad_ci_handler =
1538 {
1539 NULL, /* init */
1540 my_strnncoll_utf16_general_ci,
1541 my_strnncollsp_utf16_general_nopad_ci,
1542 my_strnncollsp_nchars_utf16_general_nopad_ci,
1543 my_strnxfrm_nopad_utf16_general_ci,
1544 my_strnxfrmlen_unicode,
1545 my_like_range_generic,
1546 my_wildcmp_utf16_ci,
1547 my_strcasecmp_mb2_or_mb4,
1548 my_instr_mb,
1549 my_hash_sort_utf16_nopad,
1550 my_propagate_simple
1551 };
1552
1553
1554 static MY_COLLATION_HANDLER my_collation_utf16_nopad_bin_handler =
1555 {
1556 NULL, /* init */
1557 my_strnncoll_utf16_bin,
1558 my_strnncollsp_utf16_nopad_bin,
1559 my_strnncollsp_nchars_utf16_nopad_bin,
1560 my_strnxfrm_unicode_full_nopad_bin,
1561 my_strnxfrmlen_unicode_full_bin,
1562 my_like_range_generic,
1563 my_wildcmp_utf16_bin,
1564 my_strcasecmp_mb2_or_mb4,
1565 my_instr_mb,
1566 my_hash_sort_utf16_nopad_bin,
1567 my_propagate_simple
1568 };
1569
1570
1571 MY_CHARSET_HANDLER my_charset_utf16_handler=
1572 {
1573 NULL, /* init */
1574 my_numchars_utf16,
1575 my_charpos_utf16,
1576 my_lengthsp_mb2,
1577 my_numcells_mb,
1578 my_utf16_uni, /* mb_wc */
1579 my_uni_utf16, /* wc_mb */
1580 my_mb_ctype_mb,
1581 my_caseup_str_mb2_or_mb4,
1582 my_casedn_str_mb2_or_mb4,
1583 my_caseup_utf16,
1584 my_casedn_utf16,
1585 my_snprintf_mb2,
1586 my_l10tostr_mb2_or_mb4,
1587 my_ll10tostr_mb2_or_mb4,
1588 my_fill_mb2,
1589 my_strntol_mb2_or_mb4,
1590 my_strntoul_mb2_or_mb4,
1591 my_strntoll_mb2_or_mb4,
1592 my_strntoull_mb2_or_mb4,
1593 my_strntod_mb2_or_mb4,
1594 my_strtoll10_mb2,
1595 my_strntoull10rnd_mb2_or_mb4,
1596 my_scan_mb2,
1597 my_charlen_utf16,
1598 my_well_formed_char_length_utf16,
1599 my_copy_fix_mb2_or_mb4,
1600 my_uni_utf16,
1601 my_wc_to_printable_generic
1602 };
1603
1604
1605 struct charset_info_st my_charset_utf16_general_ci=
1606 {
1607 54,0,0, /* number */
1608 MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1609 charset_name_utf16, /* cs name */
1610 "utf16_general_ci", /* name */
1611 "UTF-16 Unicode", /* comment */
1612 NULL, /* tailoring */
1613 NULL, /* ctype */
1614 NULL, /* to_lower */
1615 NULL, /* to_upper */
1616 NULL, /* sort_order */
1617 NULL, /* uca */
1618 NULL, /* tab_to_uni */
1619 NULL, /* tab_from_uni */
1620 &my_unicase_default, /* caseinfo */
1621 NULL, /* state_map */
1622 NULL, /* ident_map */
1623 1, /* strxfrm_multiply */
1624 1, /* caseup_multiply */
1625 1, /* casedn_multiply */
1626 2, /* mbminlen */
1627 4, /* mbmaxlen */
1628 0, /* min_sort_char */
1629 0xFFFF, /* max_sort_char */
1630 ' ', /* pad char */
1631 0, /* escape_with_backslash_is_dangerous */
1632 1, /* levels_for_order */
1633 &my_charset_utf16_handler,
1634 &my_collation_utf16_general_ci_handler
1635 };
1636
1637
1638 struct charset_info_st my_charset_utf16_bin=
1639 {
1640 55,0,0, /* number */
1641 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1642 charset_name_utf16, /* cs name */
1643 "utf16_bin", /* name */
1644 "UTF-16 Unicode", /* comment */
1645 NULL, /* tailoring */
1646 NULL, /* ctype */
1647 NULL, /* to_lower */
1648 NULL, /* to_upper */
1649 NULL, /* sort_order */
1650 NULL, /* uca */
1651 NULL, /* tab_to_uni */
1652 NULL, /* tab_from_uni */
1653 &my_unicase_default, /* caseinfo */
1654 NULL, /* state_map */
1655 NULL, /* ident_map */
1656 1, /* strxfrm_multiply */
1657 1, /* caseup_multiply */
1658 1, /* casedn_multiply */
1659 2, /* mbminlen */
1660 4, /* mbmaxlen */
1661 0, /* min_sort_char */
1662 0xFFFF, /* max_sort_char */
1663 ' ', /* pad char */
1664 0, /* escape_with_backslash_is_dangerous */
1665 1, /* levels_for_order */
1666 &my_charset_utf16_handler,
1667 &my_collation_utf16_bin_handler
1668 };
1669
1670
1671 struct charset_info_st my_charset_utf16_general_nopad_ci=
1672 {
1673 MY_NOPAD_ID(54),0,0, /* number */
1674 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
1675 charset_name_utf16, /* cs name */
1676 "utf16_general_nopad_ci", /* name */
1677 "UTF-16 Unicode", /* comment */
1678 NULL, /* tailoring */
1679 NULL, /* ctype */
1680 NULL, /* to_lower */
1681 NULL, /* to_upper */
1682 NULL, /* sort_order */
1683 NULL, /* uca */
1684 NULL, /* tab_to_uni */
1685 NULL, /* tab_from_uni */
1686 &my_unicase_default, /* caseinfo */
1687 NULL, /* state_map */
1688 NULL, /* ident_map */
1689 1, /* strxfrm_multiply */
1690 1, /* caseup_multiply */
1691 1, /* casedn_multiply */
1692 2, /* mbminlen */
1693 4, /* mbmaxlen */
1694 0, /* min_sort_char */
1695 0xFFFF, /* max_sort_char */
1696 ' ', /* pad char */
1697 0, /* escape_with_backslash_is_dangerous */
1698 1, /* levels_for_order */
1699 &my_charset_utf16_handler,
1700 &my_collation_utf16_general_nopad_ci_handler
1701 };
1702
1703
1704 struct charset_info_st my_charset_utf16_nopad_bin=
1705 {
1706 MY_NOPAD_ID(55),0,0, /* number */
1707 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
1708 MY_CS_NOPAD,
1709 charset_name_utf16, /* cs name */
1710 "utf16_nopad_bin", /* name */
1711 "UTF-16 Unicode", /* comment */
1712 NULL, /* tailoring */
1713 NULL, /* ctype */
1714 NULL, /* to_lower */
1715 NULL, /* to_upper */
1716 NULL, /* sort_order */
1717 NULL, /* uca */
1718 NULL, /* tab_to_uni */
1719 NULL, /* tab_from_uni */
1720 &my_unicase_default, /* caseinfo */
1721 NULL, /* state_map */
1722 NULL, /* ident_map */
1723 1, /* strxfrm_multiply */
1724 1, /* caseup_multiply */
1725 1, /* casedn_multiply */
1726 2, /* mbminlen */
1727 4, /* mbmaxlen */
1728 0, /* min_sort_char */
1729 0xFFFF, /* max_sort_char */
1730 ' ', /* pad char */
1731 0, /* escape_with_backslash_is_dangerous */
1732 1, /* levels_for_order */
1733 &my_charset_utf16_handler,
1734 &my_collation_utf16_nopad_bin_handler
1735 };
1736
1737
1738 #define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b1))
1739 #define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3))
1740
1741 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci
1742 #define DEFINE_STRNXFRM_UNICODE
1743 #define DEFINE_STRNXFRM_UNICODE_NOPAD
1744 #define MY_MB_WC(cs, pwc, s, e) (my_ci_mb_wc(cs, pwc, s, e))
1745 #define OPTIMIZE_ASCII 0
1746 #define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
1747 #define UNICASE_PAGE0 my_unicase_default_page00
1748 #define UNICASE_PAGES my_unicase_default_pages
1749 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1750 #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0)
1751 #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
1752 #include "strcoll.inl"
1753
1754 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_bin
1755 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1756 #define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b1, b0))
1757 #define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2))
1758 #include "strcoll.inl"
1759
1760 #define DEFINE_STRNNCOLLSP_NOPAD
1761 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_nopad_ci
1762 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1763 #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0)
1764 #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
1765 #include "strcoll.inl"
1766
1767 #define DEFINE_STRNNCOLLSP_NOPAD
1768 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_nopad_bin
1769 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1770 #define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b1, b0))
1771 #define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2))
1772 #include "strcoll.inl"
1773
1774 #undef IS_MB2_CHAR
1775 #undef IS_MB4_CHAR
1776
1777 static int
my_utf16le_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)1778 my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)),
1779 my_wc_t *pwc, const uchar *s, const uchar *e)
1780 {
1781 my_wc_t lo;
1782
1783 if (s + 2 > e)
1784 return MY_CS_TOOSMALL2;
1785
1786 if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
1787 (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
1788 return 2; /* [0000-D7FF,E000-FFFF] */
1789
1790 if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
1791 return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
1792
1793 if (s + 4 > e)
1794 return MY_CS_TOOSMALL4;
1795
1796 s+= 2;
1797
1798 if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
1799 lo > MY_UTF16_SURROGATE_LOW_LAST)
1800 return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
1801
1802 *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF));
1803 return 4;
1804 }
1805
1806
1807 static int
my_uni_utf16le(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)1808 my_uni_utf16le(CHARSET_INFO *cs __attribute__((unused)),
1809 my_wc_t wc, uchar *s, uchar *e)
1810 {
1811 uint32 first, second, total;
1812 if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
1813 (wc > MY_UTF16_SURROGATE_LOW_LAST &&
1814 wc <= 0xFFFF))
1815 {
1816 if (s + 2 > e)
1817 return MY_CS_TOOSMALL2;
1818 int2store(s, wc);
1819 return 2; /* [0000-D7FF,E000-FFFF] */
1820 }
1821
1822 if (wc < 0xFFFF || wc > 0x10FFFF)
1823 return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */
1824
1825 if (s + 4 > e)
1826 return MY_CS_TOOSMALL4;
1827
1828 wc-= 0x10000;
1829 first= (0xD800 | ((wc >> 10) & 0x3FF));
1830 second= (0xDC00 | (wc & 0x3FF));
1831 total= first | (second << 16);
1832 int4store(s, total);
1833 return 4; /* [010000-10FFFF] */
1834 }
1835
1836
1837 static size_t
my_lengthsp_utf16le(CHARSET_INFO * cs,const char * ptr,size_t length)1838 my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)),
1839 const char *ptr, size_t length)
1840 {
1841 const char *end= ptr + length;
1842 while (end > ptr + 1 && uint2korr(end - 2) == ' ')
1843 end-= 2;
1844 return (size_t) (end - ptr);
1845 }
1846
1847
1848 static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler =
1849 {
1850 NULL, /* init */
1851 my_strnncoll_utf16le_general_ci,
1852 my_strnncollsp_utf16le_general_ci,
1853 my_strnncollsp_nchars_utf16le_general_ci,
1854 my_strnxfrm_utf16le_general_ci,
1855 my_strnxfrmlen_unicode,
1856 my_like_range_generic,
1857 my_wildcmp_utf16_ci,
1858 my_strcasecmp_mb2_or_mb4,
1859 my_instr_mb,
1860 my_hash_sort_utf16,
1861 my_propagate_simple
1862 };
1863
1864
1865 static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler =
1866 {
1867 NULL, /* init */
1868 my_strnncoll_utf16le_bin,
1869 my_strnncollsp_utf16le_bin,
1870 my_strnncollsp_nchars_utf16le_bin,
1871 my_strnxfrm_unicode_full_bin,
1872 my_strnxfrmlen_unicode_full_bin,
1873 my_like_range_generic,
1874 my_wildcmp_utf16_bin,
1875 my_strcasecmp_mb2_or_mb4,
1876 my_instr_mb,
1877 my_hash_sort_utf16_bin,
1878 my_propagate_simple
1879 };
1880
1881
1882 static MY_COLLATION_HANDLER my_collation_utf16le_general_nopad_ci_handler =
1883 {
1884 NULL, /* init */
1885 my_strnncoll_utf16le_general_ci,
1886 my_strnncollsp_utf16le_general_nopad_ci,
1887 my_strnncollsp_nchars_utf16le_general_nopad_ci,
1888 my_strnxfrm_nopad_utf16le_general_ci,
1889 my_strnxfrmlen_unicode,
1890 my_like_range_generic,
1891 my_wildcmp_utf16_ci,
1892 my_strcasecmp_mb2_or_mb4,
1893 my_instr_mb,
1894 my_hash_sort_utf16_nopad,
1895 my_propagate_simple
1896 };
1897
1898
1899 static MY_COLLATION_HANDLER my_collation_utf16le_nopad_bin_handler =
1900 {
1901 NULL, /* init */
1902 my_strnncoll_utf16le_bin,
1903 my_strnncollsp_utf16le_nopad_bin,
1904 my_strnncollsp_nchars_utf16le_nopad_bin,
1905 my_strnxfrm_unicode_full_nopad_bin,
1906 my_strnxfrmlen_unicode_full_bin,
1907 my_like_range_generic,
1908 my_wildcmp_utf16_bin,
1909 my_strcasecmp_mb2_or_mb4,
1910 my_instr_mb,
1911 my_hash_sort_utf16_nopad_bin,
1912 my_propagate_simple
1913 };
1914
1915
1916 static MY_CHARSET_HANDLER my_charset_utf16le_handler=
1917 {
1918 NULL, /* init */
1919 my_numchars_utf16,
1920 my_charpos_utf16,
1921 my_lengthsp_utf16le,
1922 my_numcells_mb,
1923 my_utf16le_uni, /* mb_wc */
1924 my_uni_utf16le, /* wc_mb */
1925 my_mb_ctype_mb,
1926 my_caseup_str_mb2_or_mb4,
1927 my_casedn_str_mb2_or_mb4,
1928 my_caseup_utf16,
1929 my_casedn_utf16,
1930 my_snprintf_mb2,
1931 my_l10tostr_mb2_or_mb4,
1932 my_ll10tostr_mb2_or_mb4,
1933 my_fill_mb2,
1934 my_strntol_mb2_or_mb4,
1935 my_strntoul_mb2_or_mb4,
1936 my_strntoll_mb2_or_mb4,
1937 my_strntoull_mb2_or_mb4,
1938 my_strntod_mb2_or_mb4,
1939 my_strtoll10_mb2,
1940 my_strntoull10rnd_mb2_or_mb4,
1941 my_scan_mb2,
1942 my_charlen_utf16,
1943 my_well_formed_char_length_utf16,
1944 my_copy_fix_mb2_or_mb4,
1945 my_uni_utf16le,
1946 my_wc_to_printable_generic
1947 };
1948
1949
1950 struct charset_info_st my_charset_utf16le_general_ci=
1951 {
1952 56,0,0, /* number */
1953 MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1954 charset_name_utf16le, /* cs name */
1955 "utf16le_general_ci",/* name */
1956 "UTF-16LE Unicode", /* comment */
1957 NULL, /* tailoring */
1958 NULL, /* ctype */
1959 NULL, /* to_lower */
1960 NULL, /* to_upper */
1961 NULL, /* sort_order */
1962 NULL, /* uca */
1963 NULL, /* tab_to_uni */
1964 NULL, /* tab_from_uni */
1965 &my_unicase_default, /* caseinfo */
1966 NULL, /* state_map */
1967 NULL, /* ident_map */
1968 1, /* strxfrm_multiply */
1969 1, /* caseup_multiply */
1970 1, /* casedn_multiply */
1971 2, /* mbminlen */
1972 4, /* mbmaxlen */
1973 0, /* min_sort_char */
1974 0xFFFF, /* max_sort_char */
1975 ' ', /* pad char */
1976 0, /* escape_with_backslash_is_dangerous */
1977 1, /* levels_for_order */
1978 &my_charset_utf16le_handler,
1979 &my_collation_utf16le_general_ci_handler
1980 };
1981
1982
1983 struct charset_info_st my_charset_utf16le_bin=
1984 {
1985 62,0,0, /* number */
1986 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
1987 charset_name_utf16le, /* cs name */
1988 "utf16le_bin", /* name */
1989 "UTF-16LE Unicode", /* comment */
1990 NULL, /* tailoring */
1991 NULL, /* ctype */
1992 NULL, /* to_lower */
1993 NULL, /* to_upper */
1994 NULL, /* sort_order */
1995 NULL, /* uca */
1996 NULL, /* tab_to_uni */
1997 NULL, /* tab_from_uni */
1998 &my_unicase_default, /* caseinfo */
1999 NULL, /* state_map */
2000 NULL, /* ident_map */
2001 1, /* strxfrm_multiply */
2002 1, /* caseup_multiply */
2003 1, /* casedn_multiply */
2004 2, /* mbminlen */
2005 4, /* mbmaxlen */
2006 0, /* min_sort_char */
2007 0xFFFF, /* max_sort_char */
2008 ' ', /* pad char */
2009 0, /* escape_with_backslash_is_dangerous */
2010 1, /* levels_for_order */
2011 &my_charset_utf16le_handler,
2012 &my_collation_utf16le_bin_handler
2013 };
2014
2015
2016 struct charset_info_st my_charset_utf16le_general_nopad_ci=
2017 {
2018 MY_NOPAD_ID(56),0,0, /* number */
2019 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
2020 charset_name_utf16le, /* cs name */
2021 "utf16le_general_nopad_ci",/* name */
2022 "UTF-16LE Unicode", /* comment */
2023 NULL, /* tailoring */
2024 NULL, /* ctype */
2025 NULL, /* to_lower */
2026 NULL, /* to_upper */
2027 NULL, /* sort_order */
2028 NULL, /* uca */
2029 NULL, /* tab_to_uni */
2030 NULL, /* tab_from_uni */
2031 &my_unicase_default, /* caseinfo */
2032 NULL, /* state_map */
2033 NULL, /* ident_map */
2034 1, /* strxfrm_multiply */
2035 1, /* caseup_multiply */
2036 1, /* casedn_multiply */
2037 2, /* mbminlen */
2038 4, /* mbmaxlen */
2039 0, /* min_sort_char */
2040 0xFFFF, /* max_sort_char */
2041 ' ', /* pad char */
2042 0, /* escape_with_backslash_is_dangerous */
2043 1, /* levels_for_order */
2044 &my_charset_utf16le_handler,
2045 &my_collation_utf16le_general_nopad_ci_handler
2046 };
2047
2048
2049 struct charset_info_st my_charset_utf16le_nopad_bin=
2050 {
2051 MY_NOPAD_ID(62),0,0, /* number */
2052 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
2053 MY_CS_NOPAD,
2054 charset_name_utf16le, /* cs name */
2055 "utf16le_nopad_bin", /* name */
2056 "UTF-16LE Unicode", /* comment */
2057 NULL, /* tailoring */
2058 NULL, /* ctype */
2059 NULL, /* to_lower */
2060 NULL, /* to_upper */
2061 NULL, /* sort_order */
2062 NULL, /* uca */
2063 NULL, /* tab_to_uni */
2064 NULL, /* tab_from_uni */
2065 &my_unicase_default, /* caseinfo */
2066 NULL, /* state_map */
2067 NULL, /* ident_map */
2068 1, /* strxfrm_multiply */
2069 1, /* caseup_multiply */
2070 1, /* casedn_multiply */
2071 2, /* mbminlen */
2072 4, /* mbmaxlen */
2073 0, /* min_sort_char */
2074 0xFFFF, /* max_sort_char */
2075 ' ', /* pad char */
2076 0, /* escape_with_backslash_is_dangerous */
2077 1, /* levels_for_order */
2078 &my_charset_utf16le_handler,
2079 &my_collation_utf16le_nopad_bin_handler
2080 };
2081
2082
2083 #endif /* HAVE_CHARSET_utf16 */
2084
2085
2086 #ifdef HAVE_CHARSET_utf32
2087
2088 #include "ctype-utf32.h"
2089
/*
  Check if b0 and b1 start a valid UTF32 four-byte sequence:
  b0 must be zero and b1 must not exceed 0x10, so characters
  greater than U+10FFFF are not accepted.
*/
#define IS_UTF32_MBHEAD4(b0,b1)   (!(b0) && ((uchar) (b1) <= 0x10))

/* Only the first two bytes take part in the four-byte character test */
#define IS_MB4_CHAR(b0,b1,b2,b3)  (IS_UTF32_MBHEAD4(b0,b1))
2097
2098
/*
  Weight of one utf32 character for the general_ci collations.
  Code points above 0xFFFF weigh MY_CS_REPLACEMENT_CHARACTER.
  Otherwise the weight is the "sort" entry of the default unicase
  page, or the code point itself when the page is NULL.
*/
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
                                             uchar b2, uchar b3)
{
  MY_UNICASE_CHARACTER *page;
  my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3);

  if (wc > 0xFFFF)
    return MY_CS_REPLACEMENT_CHARACTER;

  page= my_unicase_default_pages[wc >> 8];
  if (!page)
    return (int) wc;
  return (int) page[wc & 0xFF].sort;
}
2110 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci
2111 #define DEFINE_STRNXFRM_UNICODE
2112 #define DEFINE_STRNXFRM_UNICODE_NOPAD
2113 #define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf32_quick(pwc, s, e)
2114 #define OPTIMIZE_ASCII 0
2115 #define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
2116 #define UNICASE_PAGE0 my_unicase_default_page00
2117 #define UNICASE_PAGES my_unicase_default_pages
2118 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
2119 #define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
2120 #include "strcoll.inl"
2121
2122 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_bin
2123 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
2124 #define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3))
2125 #include "strcoll.inl"
2126
2127 #define DEFINE_STRNNCOLLSP_NOPAD
2128 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_nopad_ci
2129 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
2130 #define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
2131 #include "strcoll.inl"
2132
2133 #define DEFINE_STRNNCOLLSP_NOPAD
2134 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_nopad_bin
2135 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
2136 #define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3))
2137 #include "strcoll.inl"
2138
2139 #undef IS_MB2_CHAR
2140 #undef IS_MB4_CHAR
2141
2142
2143 static int
my_utf32_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)2144 my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
2145 my_wc_t *pwc, const uchar *s, const uchar *e)
2146 {
2147 return my_mb_wc_utf32_quick(pwc, s, e);
2148 }
2149
2150
2151 static int
my_uni_utf32(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)2152 my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
2153 my_wc_t wc, uchar *s, uchar *e)
2154 {
2155 if (s + 4 > e)
2156 return MY_CS_TOOSMALL4;
2157
2158 if (wc > 0x10FFFF)
2159 return MY_CS_ILUNI;
2160
2161 s[0]= (uchar) (wc >> 24);
2162 s[1]= (uchar) (wc >> 16) & 0xFF;
2163 s[2]= (uchar) (wc >> 8) & 0xFF;
2164 s[3]= (uchar) wc & 0xFF;
2165 return 4;
2166 }
2167
2168
2169 static inline void
my_tolower_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2170 my_tolower_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2171 {
2172 MY_UNICASE_CHARACTER *page;
2173 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
2174 *wc= page[*wc & 0xFF].tolower;
2175 }
2176
2177
2178 static inline void
my_toupper_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2179 my_toupper_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2180 {
2181 MY_UNICASE_CHARACTER *page;
2182 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
2183 *wc= page[*wc & 0xFF].toupper;
2184 }
2185
2186
2187 static inline void
my_tosort_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2188 my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2189 {
2190 if (*wc <= uni_plane->maxchar)
2191 {
2192 MY_UNICASE_CHARACTER *page;
2193 if ((page= uni_plane->page[*wc >> 8]))
2194 *wc= page[*wc & 0xFF].sort;
2195 }
2196 else
2197 {
2198 *wc= MY_CS_REPLACEMENT_CHARACTER;
2199 }
2200 }
2201
2202
2203 static size_t
my_lengthsp_utf32(CHARSET_INFO * cs,const char * ptr,size_t length)2204 my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
2205 const char *ptr, size_t length)
2206 {
2207 const char *end= ptr + length;
2208 DBUG_ASSERT((length % 4) == 0);
2209 while (end > ptr + 3 && end[-1] == ' ' && !end[-2] && !end[-3] && !end[-4])
2210 end-= 4;
2211 return (size_t) (end - ptr);
2212 }
2213
2214
2215 static size_t
my_caseup_utf32(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)2216 my_caseup_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
2217 char *dst, size_t dstlen)
2218 {
2219 my_wc_t wc;
2220 int res;
2221 const char *srcend= src + srclen;
2222 char *dstend= dst + dstlen;
2223 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2224 DBUG_ASSERT(srclen <= dstlen);
2225
2226 while ((src < srcend) &&
2227 (res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
2228 {
2229 my_toupper_utf32(uni_plane, &wc);
2230 if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
2231 break;
2232 src+= res;
2233 dst+= res;
2234 }
2235 return srclen;
2236 }
2237
2238
2239 static void
my_hash_sort_utf32_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)2240 my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
2241 ulong *nr1, ulong *nr2)
2242 {
2243 my_wc_t wc;
2244 int res;
2245 const uchar *e= s + slen;
2246 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2247 register ulong m1= *nr1, m2= *nr2;
2248
2249 while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
2250 {
2251 my_tosort_utf32(uni_plane, &wc);
2252 MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
2253 MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
2254 MY_HASH_ADD(m1, m2, (uint) (wc >> 8) & 0xFF);
2255 MY_HASH_ADD(m1, m2, (uint) (wc & 0xFF));
2256 s+= res;
2257 }
2258 *nr1= m1;
2259 *nr2= m2;
2260 }
2261
2262
2263 static void
my_hash_sort_utf32(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)2264 my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen,
2265 ulong *nr1, ulong *nr2)
2266 {
2267 size_t lengthsp= my_lengthsp_utf32(cs, (const char *) s, slen);
2268 my_hash_sort_utf32_nopad(cs, s, lengthsp, nr1, nr2);
2269 }
2270
2271
2272 static size_t
my_casedn_utf32(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)2273 my_casedn_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
2274 char *dst, size_t dstlen)
2275 {
2276 my_wc_t wc;
2277 int res;
2278 const char *srcend= src + srclen;
2279 char *dstend= dst + dstlen;
2280 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2281 DBUG_ASSERT(srclen <= dstlen);
2282
2283 while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
2284 {
2285 my_tolower_utf32(uni_plane,&wc);
2286 if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
2287 break;
2288 src+= res;
2289 dst+= res;
2290 }
2291 return srclen;
2292 }
2293
2294
2295 static int
my_charlen_utf32(CHARSET_INFO * cs,const uchar * b,const uchar * e)2296 my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
2297 const uchar *b, const uchar *e)
2298 {
2299 return b + 4 > e ? MY_CS_TOOSMALL4 :
2300 IS_UTF32_MBHEAD4(b[0], b[1]) ? 4 : MY_CS_ILSEQ;
2301 }
2302
2303
/*
  Instantiate ctype-mb.inl for utf32: MY_FUNCTION_NAME() pastes the "_utf32"
  suffix onto the generated names and CHARLEN() routes character length
  checks to my_charlen_utf32(). The helper macros are undefined again right
  after the include.
*/
#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf32
#define CHARLEN(cs,str,end)       my_charlen_utf32(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
#include "ctype-mb.inl"
#undef MY_FUNCTION_NAME
#undef CHARLEN
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* Defines my_well_formed_char_length_utf32 */
2312
2313
2314 static size_t
my_vsnprintf_utf32(char * dst,size_t n,const char * fmt,va_list ap)2315 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
2316 {
2317 char *start= dst, *end= dst + n;
2318 DBUG_ASSERT((n % 4) == 0);
2319 for (; *fmt ; fmt++)
2320 {
2321 if (fmt[0] != '%')
2322 {
2323 if (dst >= end) /* End of buffer */
2324 break;
2325
2326 *dst++= '\0';
2327 *dst++= '\0';
2328 *dst++= '\0';
2329 *dst++= *fmt; /* Copy ordinary char */
2330 continue;
2331 }
2332
2333 fmt++;
2334
2335 /* Skip if max size is used (to be compatible with printf) */
2336 while ( (*fmt>='0' && *fmt<='9') || *fmt == '.' || *fmt == '-')
2337 fmt++;
2338
2339 if (*fmt == 'l')
2340 fmt++;
2341
2342 if (*fmt == 's') /* String parameter */
2343 {
2344 reg2 char *par= va_arg(ap, char *);
2345 size_t plen;
2346 size_t left_len= (size_t)(end - dst);
2347 if (!par) par= (char*)"(null)";
2348 plen= strlen(par);
2349 if (left_len <= plen*4)
2350 plen= left_len / 4 - 1;
2351
2352 for ( ; plen ; plen--, dst+= 4, par++)
2353 {
2354 dst[0]= '\0';
2355 dst[1]= '\0';
2356 dst[2]= '\0';
2357 dst[3]= par[0];
2358 }
2359 continue;
2360 }
2361 else if (*fmt == 'd' || *fmt == 'u') /* Integer parameter */
2362 {
2363 register int iarg;
2364 char nbuf[16];
2365 char *pbuf= nbuf;
2366
2367 if ((size_t) (end - dst) < 64)
2368 break;
2369 iarg= va_arg(ap, int);
2370 if (*fmt == 'd')
2371 int10_to_str((long) iarg, nbuf, -10);
2372 else
2373 int10_to_str((long) (uint) iarg,nbuf,10);
2374
2375 for (; pbuf[0]; pbuf++)
2376 {
2377 *dst++= '\0';
2378 *dst++= '\0';
2379 *dst++= '\0';
2380 *dst++= *pbuf;
2381 }
2382 continue;
2383 }
2384
2385 /* We come here on '%%', unknown code or too long parameter */
2386 if (dst == end)
2387 break;
2388 *dst++= '\0';
2389 *dst++= '\0';
2390 *dst++= '\0';
2391 *dst++= '%'; /* % used as % or unknown code */
2392 }
2393
2394 DBUG_ASSERT(dst < end);
2395 *dst++= '\0';
2396 *dst++= '\0';
2397 *dst++= '\0';
2398 *dst++= '\0'; /* End of errmessage */
2399 return (size_t) (dst - start - 4);
2400 }
2401
2402
2403 static size_t
my_snprintf_utf32(CHARSET_INFO * cs,char * to,size_t n,const char * fmt,...)2404 my_snprintf_utf32(CHARSET_INFO *cs __attribute__((unused)),
2405 char* to, size_t n, const char* fmt, ...)
2406 {
2407 size_t ret;
2408 va_list args;
2409 va_start(args,fmt);
2410 ret= my_vsnprintf_utf32(to, n, fmt, args);
2411 va_end(args);
2412 return ret;
2413 }
2414
2415
2416 static longlong
my_strtoll10_utf32(CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)2417 my_strtoll10_utf32(CHARSET_INFO *cs __attribute__((unused)),
2418 const char *nptr, char **endptr, int *error)
2419 {
2420 const char *s, *end, *start, *n_end, *true_end;
2421 uchar c;
2422 unsigned long i, j, k;
2423 ulonglong li;
2424 int negative;
2425 ulong cutoff, cutoff2, cutoff3;
2426
2427 s= nptr;
2428 /* If fixed length string */
2429 if (endptr)
2430 {
2431 /* Make sure string length is even */
2432 end= s + ((*endptr - s) / 4) * 4;
2433 while (s < end && !s[0] && !s[1] && !s[2] &&
2434 (s[3] == ' ' || s[3] == '\t'))
2435 s+= 4;
2436 if (s == end)
2437 goto no_conv;
2438 }
2439 else
2440 {
2441 /* We don't support null terminated strings in UCS2 */
2442 goto no_conv;
2443 }
2444
2445 /* Check for a sign. */
2446 negative= 0;
2447 if (!s[0] && !s[1] && !s[2] && s[3] == '-')
2448 {
2449 *error= -1; /* Mark as negative number */
2450 negative= 1;
2451 s+= 4;
2452 if (s == end)
2453 goto no_conv;
2454 cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2;
2455 cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
2456 cutoff3= MAX_NEGATIVE_NUMBER % 100;
2457 }
2458 else
2459 {
2460 *error= 0;
2461 if (!s[0] && !s[1] && !s[2] && s[3] == '+')
2462 {
2463 s+= 4;
2464 if (s == end)
2465 goto no_conv;
2466 }
2467 cutoff= ULONGLONG_MAX / LFACTOR2;
2468 cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
2469 cutoff3= ULONGLONG_MAX % 100;
2470 }
2471
2472 /* Handle case where we have a lot of pre-zero */
2473 if (!s[0] && !s[1] && !s[2] && s[3] == '0')
2474 {
2475 i= 0;
2476 do
2477 {
2478 s+= 4;
2479 if (s == end)
2480 goto end_i; /* Return 0 */
2481 }
2482 while (!s[0] && !s[1] && !s[2] && s[3] == '0');
2483 n_end= s + 4 * INIT_CNT;
2484 }
2485 else
2486 {
2487 /* Read first digit to check that it's a valid number */
2488 if (s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2489 goto no_conv;
2490 i= c;
2491 s+= 4;
2492 n_end= s + 4 * (INIT_CNT-1);
2493 }
2494
2495 /* Handle first 9 digits and store them in i */
2496 if (n_end > end)
2497 n_end= end;
2498 for (; s != n_end ; s+= 4)
2499 {
2500 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2501 goto end_i;
2502 i= i * 10 + c;
2503 }
2504 if (s == end)
2505 goto end_i;
2506
2507 /* Handle next 9 digits and store them in j */
2508 j= 0;
2509 start= s; /* Used to know how much to shift i */
2510 n_end= true_end= s + 4 * INIT_CNT;
2511 if (n_end > end)
2512 n_end= end;
2513 do
2514 {
2515 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2516 goto end_i_and_j;
2517 j= j * 10 + c;
2518 s+= 4;
2519 } while (s != n_end);
2520 if (s == end)
2521 {
2522 if (s != true_end)
2523 goto end_i_and_j;
2524 goto end3;
2525 }
2526 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2527 goto end3;
2528
2529 /* Handle the next 1 or 2 digits and store them in k */
2530 k=c;
2531 s+= 4;
2532 if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2533 goto end4;
2534 k= k * 10 + c;
2535 s+= 4;
2536 *endptr= (char*) s;
2537
2538 /* number string should have ended here */
2539 if (s != end && !s[0] && !s[1] && !s[2] && (c= (s[3] - '0')) <= 9)
2540 goto overflow;
2541
2542 /* Check that we didn't get an overflow with the last digit */
2543 if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
2544 k > cutoff3)))
2545 goto overflow;
2546 li= i * LFACTOR2+ (ulonglong) j * 100 + k;
2547 return (longlong) li;
2548
2549 overflow: /* *endptr is set here */
2550 *error= MY_ERRNO_ERANGE;
2551 return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
2552
2553 end_i:
2554 *endptr= (char*) s;
2555 return (negative ? ((longlong) -(long) i) : (longlong) i);
2556
2557 end_i_and_j:
2558 li= (ulonglong) i * lfactor[(size_t) (s-start) / 4] + j;
2559 *endptr= (char*) s;
2560 return (negative ? -((longlong) li) : (longlong) li);
2561
2562 end3:
2563 li= (ulonglong) i*LFACTOR+ (ulonglong) j;
2564 *endptr= (char*) s;
2565 return (negative ? -((longlong) li) : (longlong) li);
2566
2567 end4:
2568 li= (ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
2569 *endptr= (char*) s;
2570 if (negative)
2571 {
2572 if (li > MAX_NEGATIVE_NUMBER)
2573 goto overflow;
2574 return -((longlong) li);
2575 }
2576 return (longlong) li;
2577
2578 no_conv:
2579 /* There was no number to convert. */
2580 *error= MY_ERRNO_EDOM;
2581 *endptr= (char *) nptr;
2582 return 0;
2583 }
2584
2585
2586 static size_t
my_numchars_utf32(CHARSET_INFO * cs,const char * b,const char * e)2587 my_numchars_utf32(CHARSET_INFO *cs __attribute__((unused)),
2588 const char *b, const char *e)
2589 {
2590 return (size_t) (e - b) / 4;
2591 }
2592
2593
2594 static size_t
my_charpos_utf32(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)2595 my_charpos_utf32(CHARSET_INFO *cs __attribute__((unused)),
2596 const char *b, const char *e, size_t pos)
2597 {
2598 size_t string_length= (size_t) (e - b);
2599 return pos * 4 > string_length ? string_length + 4 : pos * 4;
2600 }
2601
2602
2603 static
my_fill_utf32(CHARSET_INFO * cs,char * s,size_t slen,int fill)2604 void my_fill_utf32(CHARSET_INFO *cs,
2605 char *s, size_t slen, int fill)
2606 {
2607 char buf[10];
2608 #ifdef DBUG_ASSERT_EXISTS
2609 uint buflen;
2610 #endif
2611 char *e= s + slen;
2612
2613 DBUG_ASSERT((slen % 4) == 0);
2614
2615 #ifdef DBUG_ASSERT_EXISTS
2616 buflen=
2617 #endif
2618 my_ci_wc_mb(cs, (my_wc_t) fill, (uchar*) buf, (uchar*) buf + sizeof(buf));
2619 DBUG_ASSERT(buflen == 4);
2620 while (s < e)
2621 {
2622 memcpy(s, buf, 4);
2623 s+= 4;
2624 }
2625 }
2626
2627
2628 static int
my_wildcmp_utf32_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2629 my_wildcmp_utf32_ci(CHARSET_INFO *cs,
2630 const char *str, const char *str_end,
2631 const char *wildstr, const char *wildend,
2632 int escape, int w_one, int w_many)
2633 {
2634 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2635 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2636 escape, w_one, w_many, uni_plane);
2637 }
2638
2639
2640 static int
my_wildcmp_utf32_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2641 my_wildcmp_utf32_bin(CHARSET_INFO *cs,
2642 const char *str,const char *str_end,
2643 const char *wildstr,const char *wildend,
2644 int escape, int w_one, int w_many)
2645 {
2646 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2647 escape, w_one, w_many, NULL);
2648 }
2649
2650
2651 static size_t
my_scan_utf32(CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)2652 my_scan_utf32(CHARSET_INFO *cs,
2653 const char *str, const char *end, int sequence_type)
2654 {
2655 const char *str0= str;
2656
2657 switch (sequence_type)
2658 {
2659 case MY_SEQ_SPACES:
2660 for ( ; str < end; )
2661 {
2662 my_wc_t wc;
2663 int res= my_utf32_uni(cs, &wc, (uchar*) str, (uchar*) end);
2664 if (res < 0 || wc != ' ')
2665 break;
2666 str+= res;
2667 }
2668 return (size_t) (str - str0);
2669 case MY_SEQ_NONSPACES:
2670 DBUG_ASSERT(0); /* Not implemented */
2671 /* pass through */
2672 default:
2673 return 0;
2674 }
2675 }
2676
2677
/*
  Collation function table used by my_charset_utf32_general_ci.
  The initializers are positional, so their order must follow the member
  order of MY_COLLATION_HANDLER. Hashing goes through my_hash_sort_utf32(),
  which shortens the string with my_lengthsp_utf32() before hashing.
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_general_ci,
  my_strnncollsp_utf32_general_ci,
  my_strnncollsp_nchars_utf32_general_ci,
  my_strnxfrm_utf32_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2693
2694
/*
  Collation function table used by my_charset_utf32_bin.
  The initializers are positional, so their order must follow the member
  order of MY_COLLATION_HANDLER. Note that hashing uses the same
  my_hash_sort_utf32() as the general_ci table.
*/
static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_bin,
  my_strnncollsp_nchars_utf32_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2710
2711
/*
  Collation function table used by my_charset_utf32_general_nopad_ci.
  Compared with the general_ci table it plugs in the _nopad variants of
  strnncollsp, strnncollsp_nchars and strnxfrm, and hashes with
  my_hash_sort_utf32_nopad(), i.e. without first shortening the string via
  my_lengthsp_utf32().
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_nopad_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_general_ci,
  my_strnncollsp_utf32_general_nopad_ci,
  my_strnncollsp_nchars_utf32_general_nopad_ci,
  my_strnxfrm_nopad_utf32_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32_nopad,
  my_propagate_simple
};
2727
2728
/*
  Collation function table used by my_charset_utf32_nopad_bin.
  Compared with the bin table it plugs in the _nopad variants of
  strnncollsp, strnncollsp_nchars and strnxfrm, and hashes with
  my_hash_sort_utf32_nopad().
*/
static MY_COLLATION_HANDLER my_collation_utf32_nopad_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_nopad_bin,
  my_strnncollsp_nchars_utf32_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32_nopad,
  my_propagate_simple
};
2744
2745
/*
  Character set function table shared by all four utf32 charset descriptors
  defined below it. The initializers are positional, so their order must
  follow the member order of MY_CHARSET_HANDLER. Operations without a
  utf32-specific version use the shared *_mb2_or_mb4, *_mb or *_generic
  implementations.
*/
MY_CHARSET_HANDLER my_charset_utf32_handler=
{
  NULL, /* init */
  my_numchars_utf32,
  my_charpos_utf32,
  my_lengthsp_utf32,
  my_numcells_mb,
  my_utf32_uni, /* mb_wc */
  my_uni_utf32, /* wc_mb */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf32,
  my_casedn_utf32,
  my_snprintf_utf32,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_utf32,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_utf32,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_utf32,
  my_charlen_utf32,
  my_well_formed_char_length_utf32,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf32, /* same function as the wc_mb entry above */
  my_wc_to_printable_generic
};
2778
2779
/*
  Descriptor of the utf32_general_ci collation (number 60), flagged as
  MY_CS_PRIMARY. Characters are always 4 bytes (mbminlen == mbmaxlen == 4),
  case information comes from my_unicase_default and no 8-bit lookup tables
  are attached.
*/
struct charset_info_st my_charset_utf32_general_ci=
{
  60,0,0, /* number */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  charset_name_utf32, /* cs name */
  "utf32_general_ci", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_general_ci_handler
};
2811
2812
/*
  Descriptor of the utf32_bin collation (number 61), flagged as
  MY_CS_BINSORT. Apart from the number, flags, name and collation handler it
  carries the same settings as my_charset_utf32_general_ci.
*/
struct charset_info_st my_charset_utf32_bin=
{
  61,0,0, /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  charset_name_utf32, /* cs name */
  "utf32_bin", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_bin_handler
};
2844
2845
/*
  Descriptor of the utf32_general_nopad_ci collation. Its number is derived
  from the general_ci one with MY_NOPAD_ID(60) and the MY_CS_NOPAD flag is
  set; comparisons go through the general_nopad_ci collation handler.
*/
struct charset_info_st my_charset_utf32_general_nopad_ci=
{
  MY_NOPAD_ID(60),0,0, /* number */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  charset_name_utf32, /* cs name */
  "utf32_general_nopad_ci", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_general_nopad_ci_handler
};
2877
2878
/*
  Descriptor of the utf32_nopad_bin collation. Its number is derived from
  the bin one with MY_NOPAD_ID(61) and both MY_CS_BINSORT and MY_CS_NOPAD
  are set; comparisons go through the nopad_bin collation handler.
*/
struct charset_info_st my_charset_utf32_nopad_bin=
{
  MY_NOPAD_ID(61),0,0, /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  charset_name_utf32, /* cs name */
  "utf32_nopad_bin", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_nopad_bin_handler
};
2911
2912
2913 #endif /* HAVE_CHARSET_utf32 */
2914
2915
2916 #ifdef HAVE_CHARSET_ucs2
2917
2918 #include "ctype-ucs2.h"
2919
/*
  Character class table attached to the ucs2 charset descriptors as "ctype".
  It holds one leading 0 followed by 256 entries, one per byte value
  0x00-0xFF; the entries for 0x80-0xFF are all 0. Within the ASCII half the
  digits '0'-'9' carry 132, 'A'-'F' 129, 'G'-'Z' 1, 'a'-'f' 130 and
  'g'-'z' 2.
  NOTE(review): the values look like the class bit masks from m_ctype.h and
  the leading 0 like the usual "index + 1" slot - confirm against that
  header.
*/
static const uchar ctype_ucs2[] = {
  0,
  32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
  72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
  16,129,129,129,129,129,129, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16,
  16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2939
/*
  256-entry byte mapping attached to the ucs2 charset descriptors as
  "to_lower": 65..90 ('A'-'Z') map to 97..122 ('a'-'z'); every other value,
  including 128..255, maps to itself.
*/
static const uchar to_lower_ucs2[] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
  64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
  96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
2958
/*
  256-entry byte mapping attached to the ucs2 charset descriptors both as
  "to_upper" and as "sort_order": 97..122 ('a'-'z') map to 65..90
  ('A'-'Z'); every other value, including 128..255, maps to itself.
*/
static const uchar to_upper_ucs2[] = {
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
  96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
2977
2978
/*
  Definitions for strcoll.inl

  IS_MB2_CHAR(x,y) is always true: every pair of bytes is accepted.
  UCS2_CODE(b0,b1) builds the 16-bit code from its high byte b0 and low
  byte b1. Both arguments are parenthesized so that the uchar cast applies
  to a whole argument expression rather than to its first operand only.
*/
#define IS_MB2_CHAR(x,y) (1)
#define UCS2_CODE(b0,b1) ((((uchar) (b0)) << 8) | ((uchar) (b1)))
2982
2983
/*
  Sort weight of the UCS2 character made of bytes b0 (high) and b1 (low):
  the "sort" value from my_unicase_default_pages when the character's page
  exists, otherwise the character code itself.
*/
static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= UCS2_CODE(b0, b1);
  MY_UNICASE_CHARACTER *page= my_unicase_default_pages[code >> 8];

  if (!page)
    return (int) code;
  return (int) page[code & 0xFF].sort;
}
2990
2991
/*
  Instantiate strcoll.inl with the name suffix "_ucs2_general_ci".
  Characters are decoded with my_mb_wc_ucs2_quick() and weighted through
  my_weight_mb2_ucs2_general_ci(); WEIGHT_ILSEQ() yields 0xFF0000 + byte.
  Both the padding and the NOPAD flavour of strnxfrm are requested.
  NOTE(review): the following sections define MY_FUNCTION_NAME and the
  WEIGHT_* macros again without #undef, so strcoll.inl presumably undefines
  them itself - confirm.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e)
#define OPTIMIZE_ASCII 0
#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0 my_unicase_default_page00
#define UNICASE_PAGES my_unicase_default_pages
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1)
#include "strcoll.inl"
3003
3004
/*
  Instantiate strcoll.inl with the name suffix "_ucs2_bin": the weight of a
  character is its plain 16-bit code (UCS2_CODE), and the BIN2 flavour of
  strnxfrm is requested.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin
#define DEFINE_STRNXFRM_UNICODE_BIN2
#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e)
#define OPTIMIZE_ASCII 0
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1)
#include "strcoll.inl"
3012
3013
/*
  Instantiate strcoll.inl with the name suffix "_ucs2_general_nopad_ci":
  same weights as the general_ci instantiation, with
  DEFINE_STRNNCOLLSP_NOPAD set.
*/
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_nopad_ci
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1)
#include "strcoll.inl"
3019
3020
/*
  Instantiate strcoll.inl with the name suffix "_ucs2_nopad_bin": same
  weights as the bin instantiation, with DEFINE_STRNNCOLLSP_NOPAD set.
*/
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_nopad_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1)
#include "strcoll.inl"
3026
3027
3028 static int
my_charlen_ucs2(CHARSET_INFO * cs,const uchar * s,const uchar * e)3029 my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3030 const uchar *s, const uchar *e)
3031 {
3032 return s + 2 > e ? MY_CS_TOOSMALLN(2) : 2;
3033 }
3034
3035
/*
  Decode one UCS2 character from [s, e) into *pwc.
  Function-pointer friendly wrapper around my_mb_wc_ucs2_quick(); the
  result of that helper is returned unchanged.
*/
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
                       my_wc_t * pwc, const uchar *s, const uchar *e)
{
  int nbytes= my_mb_wc_ucs2_quick(pwc, s, e);
  return nbytes;
}
3041
/*
  Encode the character wc as two bytes, high byte first, at r.

  @return MY_CS_TOOSMALL2 if fewer than two bytes are available before e,
          MY_CS_ILUNI if wc is above 0xFFFF,
          2 on success
*/
static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
                       my_wc_t wc, uchar *r, uchar *e)
{
  if (e < r + 2)
    return MY_CS_TOOSMALL2;

  /* UCS2 does not support characters outside BMP */
  if (wc > 0xFFFF)
    return MY_CS_ILUNI;

  r[1]= (uchar) (wc & 0xFF);
  r[0]= (uchar) (wc >> 8);
  return 2;
}
3055
3056
3057 static inline void
my_tolower_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3058 my_tolower_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3059 {
3060 MY_UNICASE_CHARACTER *page;
3061 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3062 *wc= page[*wc & 0xFF].tolower;
3063 }
3064
3065
3066 static inline void
my_toupper_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3067 my_toupper_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3068 {
3069 MY_UNICASE_CHARACTER *page;
3070 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3071 *wc= page[*wc & 0xFF].toupper;
3072 }
3073
3074
3075 static inline void
my_tosort_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3076 my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3077 {
3078 MY_UNICASE_CHARACTER *page;
3079 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3080 *wc= page[*wc & 0xFF].sort;
3081 }
3082
/*
  Convert a UCS2 string to upper case, character by character.

  Conversion stops at the end of src, at the first character that cannot be
  decoded, or when the converted character does not encode into dst with
  the same length.

  @param cs      character set; cs->caseinfo supplies the case mapping
  @param src     source string
  @param srclen  length of src in bytes
  @param dst     destination buffer
  @param dstlen  size of dst in bytes (asserted to be >= srclen)

  @return srclen, regardless of where the conversion stopped
*/
static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                             char *dst, size_t dstlen)
{
  MY_UNICASE_INFO *case_tables= cs->caseinfo;
  const char *src_limit= src + srclen;
  char *dst_limit= dst + dstlen;
  my_wc_t code;
  int nbytes;
  DBUG_ASSERT(srclen <= dstlen);

  while (src < src_limit)
  {
    nbytes= my_ucs2_uni(cs, &code, (uchar *)src, (uchar*) src_limit);
    if (nbytes <= 0)
      break;
    my_toupper_ucs2(case_tables, &code);
    if (my_uni_ucs2(cs, code, (uchar*) dst, (uchar*) dst_limit) != nbytes)
      break;
    src+= nbytes;
    dst+= nbytes;
  }
  return srclen;
}
3104
3105
3106 static void
my_hash_sort_ucs2_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)3107 my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
3108 ulong *nr1, ulong *nr2)
3109 {
3110 my_wc_t wc;
3111 int res;
3112 const uchar *e=s+slen;
3113 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3114 register ulong m1= *nr1, m2= *nr2;
3115
3116 while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
3117 {
3118 my_tosort_ucs2(uni_plane, &wc);
3119 MY_HASH_ADD_16(m1, m2, wc);
3120 s+=res;
3121 }
3122 *nr1= m1;
3123 *nr2= m2;
3124 }
3125
3126
/*
  Hash a UCS2 string after reducing its length with my_lengthsp_mb2()
  (presumably the length without trailing spaces - confirm against that
  function). The actual hashing is delegated to my_hash_sort_ucs2_nopad().
*/
static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
                              ulong *nr1, ulong *nr2)
{
  my_hash_sort_ucs2_nopad(cs, s,
                          my_lengthsp_mb2(cs, (const char *) s, slen),
                          nr1, nr2);
}
3133
/*
  Convert a UCS2 string to lower case, character by character.

  Conversion stops at the end of src, at the first character that cannot be
  decoded, or when the converted character does not encode into dst with
  the same length.

  @param cs      character set; cs->caseinfo supplies the case mapping
  @param src     source string
  @param srclen  length of src in bytes
  @param dst     destination buffer
  @param dstlen  size of dst in bytes (asserted to be >= srclen)

  @return srclen, regardless of where the conversion stopped
*/
static size_t my_casedn_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                             char *dst, size_t dstlen)
{
  MY_UNICASE_INFO *case_tables= cs->caseinfo;
  const char *src_limit= src + srclen;
  char *dst_limit= dst + dstlen;
  my_wc_t code;
  int nbytes;
  DBUG_ASSERT(srclen <= dstlen);

  while (src < src_limit)
  {
    nbytes= my_ucs2_uni(cs, &code, (uchar*) src, (uchar*) src_limit);
    if (nbytes <= 0)
      break;
    my_tolower_ucs2(case_tables, &code);
    if (my_uni_ucs2(cs, code, (uchar*) dst, (uchar*) dst_limit) != nbytes)
      break;
    src+= nbytes;
    dst+= nbytes;
  }
  return srclen;
}
3155
3156
3157 static void
my_fill_ucs2(CHARSET_INFO * cs,char * s,size_t l,int fill)3158 my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3159 char *s, size_t l, int fill)
3160 {
3161 DBUG_ASSERT(fill <= 0xFFFF);
3162 #ifdef WAITING_FOR_GCC_VECTORIZATION_BUG_TO_BE_FIXED
3163 /*
3164 This code with int2store() is known to be faster on some processors,
3165 but crashes on other processors due to a possible bug in GCC's
3166 -ftree-vectorization (which is enabled in -O3) in case of
3167 a non-aligned memory. See here for details:
3168 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58039
3169 */
3170 char *last= s + l - 2;
3171 uint16 tmp= (fill >> 8) + ((fill & 0xFF) << 8); /* swap bytes */
3172 DBUG_ASSERT(fill <= 0xFFFF);
3173 for ( ; s <= last; s+= 2)
3174 int2store(s, tmp); /* store little-endian */
3175 #else
3176 for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
3177 #endif
3178 }
3179
3180
3181 static
my_numchars_ucs2(CHARSET_INFO * cs,const char * b,const char * e)3182 size_t my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3183 const char *b, const char *e)
3184 {
3185 return (size_t) (e-b)/2;
3186 }
3187
3188
3189 static
my_charpos_ucs2(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)3190 size_t my_charpos_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3191 const char *b __attribute__((unused)),
3192 const char *e __attribute__((unused)),
3193 size_t pos)
3194 {
3195 size_t string_length= (size_t) (e - b);
3196 return pos > string_length ? string_length + 2 : pos * 2;
3197 }
3198
3199
3200 static size_t
my_well_formed_char_length_ucs2(CHARSET_INFO * cs,const char * b,const char * e,size_t nchars,MY_STRCOPY_STATUS * status)3201 my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3202 const char *b, const char *e,
3203 size_t nchars, MY_STRCOPY_STATUS *status)
3204 {
3205 size_t length= e - b;
3206 if (nchars * 2 <= length)
3207 {
3208 status->m_well_formed_error_pos= NULL;
3209 status->m_source_end_pos= b + (nchars * 2);
3210 return nchars;
3211 }
3212 if (length % 2)
3213 {
3214 status->m_well_formed_error_pos= status->m_source_end_pos= e - 1;
3215 }
3216 else
3217 {
3218 status->m_well_formed_error_pos= NULL;
3219 status->m_source_end_pos= e;
3220 }
3221 return length / 2;
3222 }
3223
3224
3225 static
my_wildcmp_ucs2_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3226 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
3227 const char *str,const char *str_end,
3228 const char *wildstr,const char *wildend,
3229 int escape, int w_one, int w_many)
3230 {
3231 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3232 return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3233 escape,w_one,w_many,uni_plane);
3234 }
3235
3236
3237 static
my_wildcmp_ucs2_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3238 int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
3239 const char *str,const char *str_end,
3240 const char *wildstr,const char *wildend,
3241 int escape, int w_one, int w_many)
3242 {
3243 return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3244 escape,w_one,w_many,NULL);
3245 }
3246
3247
3248 static void
my_hash_sort_ucs2_nopad_bin(CHARSET_INFO * cs,const uchar * key,size_t len,ulong * nr1,ulong * nr2)3249 my_hash_sort_ucs2_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
3250 const uchar *key, size_t len,
3251 ulong *nr1, ulong *nr2)
3252 {
3253 const uchar *end= key + len;
3254 register ulong m1= *nr1, m2= *nr2;
3255 for ( ; key < end ; key++)
3256 {
3257 MY_HASH_ADD(m1, m2, (uint)*key);
3258 }
3259 *nr1= m1;
3260 *nr2= m2;
3261 }
3262
3263
3264 static void
my_hash_sort_ucs2_bin(CHARSET_INFO * cs,const uchar * key,size_t len,ulong * nr1,ulong * nr2)3265 my_hash_sort_ucs2_bin(CHARSET_INFO *cs,
3266 const uchar *key, size_t len, ulong *nr1, ulong *nr2)
3267 {
3268 size_t lengthsp= my_lengthsp_mb2(cs, (const char *) key, len);
3269 my_hash_sort_ucs2_nopad_bin(cs, key, lengthsp, nr1, nr2);
3270 }
3271
3272
/*
  Collation function table used by my_charset_ucs2_general_ci.
  The initializers are positional, so their order must follow the member
  order of MY_COLLATION_HANDLER. Hashing goes through my_hash_sort_ucs2(),
  which shortens the string with my_lengthsp_mb2() before hashing.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_general_ci,
  my_strnncollsp_ucs2_general_ci,
  my_strnncollsp_nchars_ucs2_general_ci,
  my_strnxfrm_ucs2_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2,
  my_propagate_simple
};
3288
3289
/*
  Collation function table for the binary ucs2 collation.
  The initializers are positional, so their order must follow the member
  order of MY_COLLATION_HANDLER. Hashing goes through
  my_hash_sort_ucs2_bin(), which hashes raw bytes after shortening the
  string with my_lengthsp_mb2().
*/
static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_bin,
  my_strnncollsp_ucs2_bin,
  my_strnncollsp_nchars_ucs2_bin,
  my_strnxfrm_ucs2_bin,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2_bin,
  my_propagate_simple
};
3305
3306
/*
  Collation function table for the NOPAD flavour of ucs2 general_ci.
  Compared with the general_ci table it plugs in the _nopad variants of
  strnncollsp, strnncollsp_nchars and strnxfrm, and hashes with
  my_hash_sort_ucs2_nopad(), i.e. without first shortening the string via
  my_lengthsp_mb2().
*/
static MY_COLLATION_HANDLER my_collation_ucs2_general_nopad_ci_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_general_ci,
  my_strnncollsp_ucs2_general_nopad_ci,
  my_strnncollsp_nchars_ucs2_general_nopad_ci,
  my_strnxfrm_nopad_ucs2_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2_nopad,
  my_propagate_simple
};
3322
3323
/*
  Collation function table for the NOPAD flavour of the binary ucs2
  collation. Compared with the bin table it plugs in the _nopad variants of
  strnncollsp, strnncollsp_nchars and strnxfrm, and hashes raw bytes over
  the full length with my_hash_sort_ucs2_nopad_bin().
*/
static MY_COLLATION_HANDLER my_collation_ucs2_nopad_bin_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_bin,
  my_strnncollsp_ucs2_nopad_bin,
  my_strnncollsp_nchars_ucs2_nopad_bin,
  my_strnxfrm_nopad_ucs2_bin,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2_nopad_bin,
  my_propagate_simple
};
3339
3340
/*
  Character set function table for ucs2, referenced by the ucs2 charset
  descriptors. The initializers are positional, so their order must follow
  the member order of MY_CHARSET_HANDLER. Operations without a
  ucs2-specific version use the shared *_mb2, *_mb2_or_mb4, *_mb or
  *_generic implementations.
*/
MY_CHARSET_HANDLER my_charset_ucs2_handler=
{
  NULL, /* init */
  my_numchars_ucs2,
  my_charpos_ucs2,
  my_lengthsp_mb2,
  my_numcells_mb,
  my_ucs2_uni, /* mb_wc */
  my_uni_ucs2, /* wc_mb */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_ucs2,
  my_casedn_ucs2,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_ucs2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2,
  my_charlen_ucs2,
  my_well_formed_char_length_ucs2,
  my_copy_fix_mb2_or_mb4,
  my_uni_ucs2, /* same function as the wc_mb entry above */
  my_wc_to_printable_generic
};
3373
3374
3375 struct charset_info_st my_charset_ucs2_general_ci=
3376 {
3377 35,0,0, /* number */
3378 MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
3379 charset_name_ucs2, /* cs name */
3380 "ucs2_general_ci", /* name */
3381 "", /* comment */
3382 NULL, /* tailoring */
3383 ctype_ucs2, /* ctype */
3384 to_lower_ucs2, /* to_lower */
3385 to_upper_ucs2, /* to_upper */
3386 to_upper_ucs2, /* sort_order */
3387 NULL, /* uca */
3388 NULL, /* tab_to_uni */
3389 NULL, /* tab_from_uni */
3390 &my_unicase_default,/* caseinfo */
3391 NULL, /* state_map */
3392 NULL, /* ident_map */
3393 1, /* strxfrm_multiply */
3394 1, /* caseup_multiply */
3395 1, /* casedn_multiply */
3396 2, /* mbminlen */
3397 2, /* mbmaxlen */
3398 0, /* min_sort_char */
3399 0xFFFF, /* max_sort_char */
3400 ' ', /* pad char */
3401 0, /* escape_with_backslash_is_dangerous */
3402 1, /* levels_for_order */
3403 &my_charset_ucs2_handler,
3404 &my_collation_ucs2_general_ci_handler
3405 };
3406
3407
3408 struct charset_info_st my_charset_ucs2_general_mysql500_ci=
3409 {
3410 159, 0, 0, /* number */
3411 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
3412 charset_name_ucs2, /* cs name */
3413 "ucs2_general_mysql500_ci", /* name */
3414 "", /* comment */
3415 NULL, /* tailoring */
3416 ctype_ucs2, /* ctype */
3417 to_lower_ucs2, /* to_lower */
3418 to_upper_ucs2, /* to_upper */
3419 to_upper_ucs2, /* sort_order */
3420 NULL, /* uca */
3421 NULL, /* tab_to_uni */
3422 NULL, /* tab_from_uni */
3423 &my_unicase_mysql500, /* caseinfo */
3424 NULL, /* state_map */
3425 NULL, /* ident_map */
3426 1, /* strxfrm_multiply */
3427 1, /* caseup_multiply */
3428 1, /* casedn_multiply */
3429 2, /* mbminlen */
3430 2, /* mbmaxlen */
3431 0, /* min_sort_char */
3432 0xFFFF, /* max_sort_char */
3433 ' ', /* pad char */
3434 0, /* escape_with_backslash_is_dangerous */
3435 1, /* levels_for_order */
3436 &my_charset_ucs2_handler,
3437 &my_collation_ucs2_general_ci_handler
3438 };
3439
3440
3441 struct charset_info_st my_charset_ucs2_bin=
3442 {
3443 90,0,0, /* number */
3444 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
3445 charset_name_ucs2, /* cs name */
3446 "ucs2_bin", /* name */
3447 "", /* comment */
3448 NULL, /* tailoring */
3449 ctype_ucs2, /* ctype */
3450 to_lower_ucs2, /* to_lower */
3451 to_upper_ucs2, /* to_upper */
3452 NULL, /* sort_order */
3453 NULL, /* uca */
3454 NULL, /* tab_to_uni */
3455 NULL, /* tab_from_uni */
3456 &my_unicase_default,/* caseinfo */
3457 NULL, /* state_map */
3458 NULL, /* ident_map */
3459 1, /* strxfrm_multiply */
3460 1, /* caseup_multiply */
3461 1, /* casedn_multiply */
3462 2, /* mbminlen */
3463 2, /* mbmaxlen */
3464 0, /* min_sort_char */
3465 0xFFFF, /* max_sort_char */
3466 ' ', /* pad char */
3467 0, /* escape_with_backslash_is_dangerous */
3468 1, /* levels_for_order */
3469 &my_charset_ucs2_handler,
3470 &my_collation_ucs2_bin_handler
3471 };
3472
3473
3474 struct charset_info_st my_charset_ucs2_general_nopad_ci=
3475 {
3476 MY_NOPAD_ID(35),0,0, /* number */
3477 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
3478 charset_name_ucs2, /* cs name */
3479 "ucs2_general_nopad_ci", /* name */
3480 "", /* comment */
3481 NULL, /* tailoring */
3482 ctype_ucs2, /* ctype */
3483 to_lower_ucs2, /* to_lower */
3484 to_upper_ucs2, /* to_upper */
3485 to_upper_ucs2, /* sort_order */
3486 NULL, /* uca */
3487 NULL, /* tab_to_uni */
3488 NULL, /* tab_from_uni */
3489 &my_unicase_default, /* caseinfo */
3490 NULL, /* state_map */
3491 NULL, /* ident_map */
3492 1, /* strxfrm_multiply */
3493 1, /* caseup_multiply */
3494 1, /* casedn_multiply */
3495 2, /* mbminlen */
3496 2, /* mbmaxlen */
3497 0, /* min_sort_char */
3498 0xFFFF, /* max_sort_char */
3499 ' ', /* pad char */
3500 0, /* escape_with_backslash_is_dangerous */
3501 1, /* levels_for_order */
3502 &my_charset_ucs2_handler,
3503 &my_collation_ucs2_general_nopad_ci_handler
3504 };
3505
3506
3507 struct charset_info_st my_charset_ucs2_nopad_bin=
3508 {
3509 MY_NOPAD_ID(90),0,0, /* number */
3510 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
3511 charset_name_ucs2, /* cs name */
3512 "ucs2_nopad_bin", /* name */
3513 "", /* comment */
3514 NULL, /* tailoring */
3515 ctype_ucs2, /* ctype */
3516 to_lower_ucs2, /* to_lower */
3517 to_upper_ucs2, /* to_upper */
3518 NULL, /* sort_order */
3519 NULL, /* uca */
3520 NULL, /* tab_to_uni */
3521 NULL, /* tab_from_uni */
3522 &my_unicase_default, /* caseinfo */
3523 NULL, /* state_map */
3524 NULL, /* ident_map */
3525 1, /* strxfrm_multiply */
3526 1, /* caseup_multiply */
3527 1, /* casedn_multiply */
3528 2, /* mbminlen */
3529 2, /* mbmaxlen */
3530 0, /* min_sort_char */
3531 0xFFFF, /* max_sort_char */
3532 ' ', /* pad char */
3533 0, /* escape_with_backslash_is_dangerous */
3534 1, /* levels_for_order */
3535 &my_charset_ucs2_handler,
3536 &my_collation_ucs2_nopad_bin_handler
3537 };
3538
3539 #endif /* HAVE_CHARSET_ucs2 */
3540