1 /* Copyright (c) 2003, 2013, Oracle and/or its affiliates
2 Copyright (c) 2009, 2016, MariaDB
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Library General Public
6 License as published by the Free Software Foundation; version 2
7 of the License.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
13
14 You should have received a copy of the GNU Library General Public
15 License along with this library; if not, write to the Free
16 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
17 MA 02110-1335 USA */
18
19 /* UCS2 support. Written by Alexander Barkov <bar@mysql.com> */
20
21 #include "strings_def.h"
22 #include <m_ctype.h>
23 #include <my_sys.h>
24 #include <stdarg.h>
25
26 #include "ctype-unidata.h"
27
28
29 #if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2)
30 #define HAVE_CHARSET_mb2
31 #endif
32
33
34 #if defined(HAVE_CHARSET_mb2) || defined(HAVE_CHARSET_utf32)
35 #define HAVE_CHARSET_mb2_or_mb4
36 #endif
37
38
39 #ifndef EILSEQ
40 #define EILSEQ ENOENT
41 #endif
42
/*
  Numeric limits and decimal scaling factors used by the
  string-to-integer converters in this file.
*/
#undef ULONGLONG_MAX
#define ULONGLONG_MAX           (~(ulonglong) 0)
/* Magnitude of the most negative 64-bit signed value (2^63) */
#define MAX_NEGATIVE_NUMBER     ((ulonglong) 0x8000000000000000LL)
/* Number of decimal digits accumulated per step by the strtoll10 parser */
#define INIT_CNT                9
#define LFACTOR                 1000000000ULL     /* 10^9  */
#define LFACTOR1                10000000000ULL    /* 10^10 */
#define LFACTOR2                100000000000ULL   /* 10^11 */

#if defined(HAVE_CHARSET_utf32) || defined(HAVE_CHARSET_mb2)
/* lfactor[n] == 10^n for n= 0..8. Read-only lookup table. */
static const unsigned long lfactor[9]=
{
  1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L
};
#endif
55
56
57 #ifdef HAVE_CHARSET_mb2_or_mb4
58 static size_t
my_caseup_str_mb2_or_mb4(CHARSET_INFO * cs,char * s)59 my_caseup_str_mb2_or_mb4(CHARSET_INFO * cs __attribute__((unused)),
60 char * s __attribute__((unused)))
61 {
62 DBUG_ASSERT(0);
63 return 0;
64 }
65
66
67 static size_t
my_casedn_str_mb2_or_mb4(CHARSET_INFO * cs,char * s)68 my_casedn_str_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
69 char * s __attribute__((unused)))
70 {
71 DBUG_ASSERT(0);
72 return 0;
73 }
74
75
76 static int
my_strcasecmp_mb2_or_mb4(CHARSET_INFO * cs,const char * s,const char * t)77 my_strcasecmp_mb2_or_mb4(CHARSET_INFO *cs __attribute__((unused)),
78 const char *s __attribute__((unused)),
79 const char *t __attribute__((unused)))
80 {
81 DBUG_ASSERT(0);
82 return 0;
83 }
84
85
/* Outcome of copying a single character */
typedef enum
{
  MY_CHAR_COPY_OK=    0,  /* The character was OK                            */
  MY_CHAR_COPY_ERROR= 1,  /* The character was not OK and could not be fixed */
  MY_CHAR_COPY_FIXED= 2   /* The character was not OK, was replaced by '?'   */
} my_char_copy_status_t;
92
93
94 /*
95 Copies an incomplete character, lef-padding it with 0x00 bytes.
96
97 @param cs Character set
98 @param dst The destination string
99 @param dst_length Space available in dst
100 @param src The source string
101 @param src_length Length of src
102 @param nchars Copy not more than nchars characters.
103 The "nchars" parameter of the caller.
104 Only 0 and non-0 are important here.
105 @param fix What to do if after zero-padding didn't get a valid
106 character:
107 - FALSE - exit with error.
108 - TRUE - try to put '?' instead.
109
110 @return MY_CHAR_COPY_OK if after zero-padding got a valid character.
111 cs->mbmaxlen bytes were written to "dst".
112 @return MY_CHAR_COPY_FIXED if after zero-padding did not get a valid
113 character, but wrote '?' to the destination
114 string instead.
115 cs->mbminlen bytes were written to "dst".
116 @return MY_CHAR_COPY_ERROR If failed and nothing was written to "dst".
117 Possible reasons:
118 - dst_length was too short
119 - nchars was 0
120 - the character after padding appeared not
121 to be valid, and could not fix it to '?'.
122 */
123 static my_char_copy_status_t
my_copy_incomplete_char(CHARSET_INFO * cs,char * dst,size_t dst_length,const char * src,size_t src_length,size_t nchars,my_bool fix)124 my_copy_incomplete_char(CHARSET_INFO *cs,
125 char *dst, size_t dst_length,
126 const char *src, size_t src_length,
127 size_t nchars, my_bool fix)
128 {
129 size_t pad_length;
130 size_t src_offset= src_length % cs->mbminlen;
131 if (dst_length < cs->mbminlen || !nchars)
132 return MY_CHAR_COPY_ERROR;
133
134 pad_length= cs->mbminlen - src_offset;
135 bzero(dst, pad_length);
136 memmove(dst + pad_length, src, src_offset);
137 /*
138 In some cases left zero-padding can create an incorrect character.
139 For example:
140 INSERT INTO t1 (utf32_column) VALUES (0x110000);
141 We'll pad the value to 0x00110000, which is a wrong UTF32 sequence!
142 The valid characters range is limited to 0x00000000..0x0010FFFF.
143
144 Make sure we didn't pad to an incorrect character.
145 */
146 if (cs->cset->charlen(cs, (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
147 (int) cs->mbminlen)
148 return MY_CHAR_COPY_OK;
149
150 if (fix &&
151 cs->cset->wc_mb(cs, '?', (uchar *) dst, (uchar *) dst + cs->mbminlen) ==
152 (int) cs->mbminlen)
153 return MY_CHAR_COPY_FIXED;
154
155 return MY_CHAR_COPY_ERROR;
156 }
157
158
159 /*
160 Copy an UCS2/UTF16/UTF32 string, fix bad characters.
161 */
162 static size_t
my_copy_fix_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t dst_length,const char * src,size_t src_length,size_t nchars,MY_STRCOPY_STATUS * status)163 my_copy_fix_mb2_or_mb4(CHARSET_INFO *cs,
164 char *dst, size_t dst_length,
165 const char *src, size_t src_length,
166 size_t nchars, MY_STRCOPY_STATUS *status)
167 {
168 size_t length2, src_offset= src_length % cs->mbminlen;
169 my_char_copy_status_t padstatus;
170
171 if (!src_offset)
172 return my_copy_fix_mb(cs, dst, dst_length,
173 src, src_length, nchars, status);
174 if ((padstatus= my_copy_incomplete_char(cs, dst, dst_length,
175 src, src_length, nchars, TRUE)) ==
176 MY_CHAR_COPY_ERROR)
177 {
178 status->m_source_end_pos= status->m_well_formed_error_pos= src;
179 return 0;
180 }
181 length2= my_copy_fix_mb(cs, dst + cs->mbminlen, dst_length - cs->mbminlen,
182 src + src_offset, src_length - src_offset,
183 nchars - 1, status);
184 if (padstatus == MY_CHAR_COPY_FIXED)
185 status->m_well_formed_error_pos= src;
186 return cs->mbminlen /* The left-padded character */ + length2;
187 }
188
189
190 static long
my_strntol_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)191 my_strntol_mb2_or_mb4(CHARSET_INFO *cs,
192 const char *nptr, size_t l, int base,
193 char **endptr, int *err)
194 {
195 int negative= 0;
196 int overflow;
197 int cnv;
198 my_wc_t wc;
199 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
200 register unsigned int cutlim;
201 register uint32 cutoff;
202 register uint32 res;
203 register const uchar *s= (const uchar*) nptr;
204 register const uchar *e= (const uchar*) nptr+l;
205 const uchar *save;
206
207 *err= 0;
208 do
209 {
210 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
211 {
212 switch (wc)
213 {
214 case ' ' : break;
215 case '\t': break;
216 case '-' : negative= !negative; break;
217 case '+' : break;
218 default : goto bs;
219 }
220 }
221 else /* No more characters or bad multibyte sequence */
222 {
223 if (endptr != NULL )
224 *endptr= (char*) s;
225 err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
226 return 0;
227 }
228 s+= cnv;
229 } while (1);
230
231 bs:
232
233 overflow= 0;
234 res= 0;
235 save= s;
236 cutoff= ((uint32)~0L) / (uint32) base;
237 cutlim= (uint) (((uint32)~0L) % (uint32) base);
238
239 do {
240 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
241 {
242 s+= cnv;
243 if (wc >= '0' && wc <= '9')
244 wc-= '0';
245 else if (wc >= 'A' && wc <= 'Z')
246 wc= wc - 'A' + 10;
247 else if (wc >= 'a' && wc <= 'z')
248 wc= wc - 'a' + 10;
249 else
250 break;
251 if ((int)wc >= base)
252 break;
253 if (res > cutoff || (res == cutoff && wc > cutlim))
254 overflow= 1;
255 else
256 {
257 res*= (uint32) base;
258 res+= wc;
259 }
260 }
261 else if (cnv == MY_CS_ILSEQ)
262 {
263 if (endptr !=NULL )
264 *endptr = (char*) s;
265 err[0]= EILSEQ;
266 return 0;
267 }
268 else
269 {
270 /* No more characters */
271 break;
272 }
273 } while(1);
274
275 if (endptr != NULL)
276 *endptr = (char *) s;
277
278 if (s == save)
279 {
280 err[0]= EDOM;
281 return 0L;
282 }
283
284 if (negative)
285 {
286 if (res > (uint32) INT_MIN32)
287 overflow= 1;
288 }
289 else if (res > INT_MAX32)
290 overflow= 1;
291
292 if (overflow)
293 {
294 err[0]= ERANGE;
295 return negative ? INT_MIN32 : INT_MAX32;
296 }
297
298 return (negative ? -((long) res) : (long) res);
299 }
300
301
302 static ulong
my_strntoul_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)303 my_strntoul_mb2_or_mb4(CHARSET_INFO *cs,
304 const char *nptr, size_t l, int base,
305 char **endptr, int *err)
306 {
307 int negative= 0;
308 int overflow;
309 int cnv;
310 my_wc_t wc;
311 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
312 register unsigned int cutlim;
313 register uint32 cutoff;
314 register uint32 res;
315 register const uchar *s= (const uchar*) nptr;
316 register const uchar *e= (const uchar*) nptr + l;
317 const uchar *save;
318
319 *err= 0;
320 do
321 {
322 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
323 {
324 switch (wc)
325 {
326 case ' ' : break;
327 case '\t': break;
328 case '-' : negative= !negative; break;
329 case '+' : break;
330 default : goto bs;
331 }
332 }
333 else /* No more characters or bad multibyte sequence */
334 {
335 if (endptr !=NULL )
336 *endptr= (char*)s;
337 err[0]= (cnv == MY_CS_ILSEQ) ? EILSEQ : EDOM;
338 return 0;
339 }
340 s+= cnv;
341 } while (1);
342
343 bs:
344
345 overflow= 0;
346 res= 0;
347 save= s;
348 cutoff= ((uint32)~0L) / (uint32) base;
349 cutlim= (uint) (((uint32)~0L) % (uint32) base);
350
351 do
352 {
353 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
354 {
355 s+= cnv;
356 if (wc >= '0' && wc <= '9')
357 wc-= '0';
358 else if (wc >= 'A' && wc <= 'Z')
359 wc= wc - 'A' + 10;
360 else if (wc >= 'a' && wc <= 'z')
361 wc= wc - 'a' + 10;
362 else
363 break;
364 if ((int) wc >= base)
365 break;
366 if (res > cutoff || (res == cutoff && wc > cutlim))
367 overflow = 1;
368 else
369 {
370 res*= (uint32) base;
371 res+= wc;
372 }
373 }
374 else if (cnv == MY_CS_ILSEQ)
375 {
376 if (endptr != NULL )
377 *endptr= (char*)s;
378 err[0]= EILSEQ;
379 return 0;
380 }
381 else
382 {
383 /* No more characters */
384 break;
385 }
386 } while(1);
387
388 if (endptr != NULL)
389 *endptr= (char *) s;
390
391 if (s == save)
392 {
393 err[0]= EDOM;
394 return 0L;
395 }
396
397 if (overflow)
398 {
399 err[0]= (ERANGE);
400 return (~(uint32) 0);
401 }
402
403 return (negative ? -((long) res) : (long) res);
404 }
405
406
407 static longlong
my_strntoll_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)408 my_strntoll_mb2_or_mb4(CHARSET_INFO *cs,
409 const char *nptr, size_t l, int base,
410 char **endptr, int *err)
411 {
412 int negative=0;
413 int overflow;
414 int cnv;
415 my_wc_t wc;
416 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
417 register ulonglong cutoff;
418 register unsigned int cutlim;
419 register ulonglong res;
420 register const uchar *s= (const uchar*) nptr;
421 register const uchar *e= (const uchar*) nptr+l;
422 const uchar *save;
423
424 *err= 0;
425 do
426 {
427 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
428 {
429 switch (wc)
430 {
431 case ' ' : break;
432 case '\t': break;
433 case '-' : negative= !negative; break;
434 case '+' : break;
435 default : goto bs;
436 }
437 }
438 else /* No more characters or bad multibyte sequence */
439 {
440 if (endptr !=NULL )
441 *endptr = (char*)s;
442 err[0] = (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
443 return 0;
444 }
445 s+=cnv;
446 } while (1);
447
448 bs:
449
450 overflow = 0;
451 res = 0;
452 save = s;
453 cutoff = (~(ulonglong) 0) / (unsigned long int) base;
454 cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
455
456 do {
457 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
458 {
459 s+=cnv;
460 if ( wc>='0' && wc<='9')
461 wc -= '0';
462 else if ( wc>='A' && wc<='Z')
463 wc = wc - 'A' + 10;
464 else if ( wc>='a' && wc<='z')
465 wc = wc - 'a' + 10;
466 else
467 break;
468 if ((int)wc >= base)
469 break;
470 if (res > cutoff || (res == cutoff && wc > cutlim))
471 overflow = 1;
472 else
473 {
474 res *= (ulonglong) base;
475 res += wc;
476 }
477 }
478 else if (cnv==MY_CS_ILSEQ)
479 {
480 if (endptr !=NULL )
481 *endptr = (char*)s;
482 err[0]=EILSEQ;
483 return 0;
484 }
485 else
486 {
487 /* No more characters */
488 break;
489 }
490 } while(1);
491
492 if (endptr != NULL)
493 *endptr = (char *) s;
494
495 if (s == save)
496 {
497 err[0]=EDOM;
498 return 0L;
499 }
500
501 if (negative)
502 {
503 if (res > (ulonglong) LONGLONG_MIN)
504 overflow = 1;
505 }
506 else if (res > (ulonglong) LONGLONG_MAX)
507 overflow = 1;
508
509 if (overflow)
510 {
511 err[0]=ERANGE;
512 return negative ? LONGLONG_MIN : LONGLONG_MAX;
513 }
514
515 return (negative ? -((longlong)res) : (longlong)res);
516 }
517
518
519 static ulonglong
my_strntoull_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t l,int base,char ** endptr,int * err)520 my_strntoull_mb2_or_mb4(CHARSET_INFO *cs,
521 const char *nptr, size_t l, int base,
522 char **endptr, int *err)
523 {
524 int negative= 0;
525 int overflow;
526 int cnv;
527 my_wc_t wc;
528 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
529 register ulonglong cutoff;
530 register unsigned int cutlim;
531 register ulonglong res;
532 register const uchar *s= (const uchar*) nptr;
533 register const uchar *e= (const uchar*) nptr + l;
534 const uchar *save;
535
536 *err= 0;
537 do
538 {
539 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
540 {
541 switch (wc)
542 {
543 case ' ' : break;
544 case '\t': break;
545 case '-' : negative= !negative; break;
546 case '+' : break;
547 default : goto bs;
548 }
549 }
550 else /* No more characters or bad multibyte sequence */
551 {
552 if (endptr !=NULL )
553 *endptr = (char*)s;
554 err[0]= (cnv==MY_CS_ILSEQ) ? EILSEQ : EDOM;
555 return 0;
556 }
557 s+=cnv;
558 } while (1);
559
560 bs:
561
562 overflow = 0;
563 res = 0;
564 save = s;
565 cutoff = (~(ulonglong) 0) / (unsigned long int) base;
566 cutlim = (uint) ((~(ulonglong) 0) % (unsigned long int) base);
567
568 do
569 {
570 if ((cnv= mb_wc(cs, &wc, s, e)) > 0)
571 {
572 s+=cnv;
573 if ( wc>='0' && wc<='9')
574 wc -= '0';
575 else if ( wc>='A' && wc<='Z')
576 wc = wc - 'A' + 10;
577 else if ( wc>='a' && wc<='z')
578 wc = wc - 'a' + 10;
579 else
580 break;
581 if ((int)wc >= base)
582 break;
583 if (res > cutoff || (res == cutoff && wc > cutlim))
584 overflow = 1;
585 else
586 {
587 res *= (ulonglong) base;
588 res += wc;
589 }
590 }
591 else if (cnv==MY_CS_ILSEQ)
592 {
593 if (endptr !=NULL )
594 *endptr = (char*)s;
595 err[0]= EILSEQ;
596 return 0;
597 }
598 else
599 {
600 /* No more characters */
601 break;
602 }
603 } while(1);
604
605 if (endptr != NULL)
606 *endptr = (char *) s;
607
608 if (s == save)
609 {
610 err[0]= EDOM;
611 return 0L;
612 }
613
614 if (overflow)
615 {
616 err[0]= ERANGE;
617 return (~(ulonglong) 0);
618 }
619
620 return (negative ? -((longlong) res) : (longlong) res);
621 }
622
623
624 static double
my_strntod_mb2_or_mb4(CHARSET_INFO * cs,char * nptr,size_t length,char ** endptr,int * err)625 my_strntod_mb2_or_mb4(CHARSET_INFO *cs,
626 char *nptr, size_t length,
627 char **endptr, int *err)
628 {
629 char buf[256];
630 double res;
631 register char *b= buf;
632 register const uchar *s= (const uchar*) nptr;
633 const uchar *end;
634 my_wc_t wc;
635 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
636 int cnv;
637
638 *err= 0;
639 /* Cut too long strings */
640 if (length >= sizeof(buf))
641 length= sizeof(buf) - 1;
642 end= s + length;
643
644 while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
645 {
646 s+= cnv;
647 if (wc > (int) (uchar) 'e' || !wc)
648 break; /* Can't be part of double */
649 *b++= (char) wc;
650 }
651
652 *endptr= b;
653 res= my_strtod(buf, endptr, err);
654 *endptr= nptr + cs->mbminlen * (size_t) (*endptr - buf);
655 return res;
656 }
657
658
659 static ulonglong
my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO * cs,const char * nptr,size_t length,int unsign_fl,char ** endptr,int * err)660 my_strntoull10rnd_mb2_or_mb4(CHARSET_INFO *cs,
661 const char *nptr, size_t length,
662 int unsign_fl,
663 char **endptr, int *err)
664 {
665 char buf[256], *b= buf;
666 ulonglong res;
667 const uchar *end, *s= (const uchar*) nptr;
668 my_wc_t wc;
669 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
670 int cnv;
671
672 /* Cut too long strings */
673 if (length >= sizeof(buf))
674 length= sizeof(buf)-1;
675 end= s + length;
676
677 while ((cnv= mb_wc(cs, &wc, s, end)) > 0)
678 {
679 s+= cnv;
680 if (wc > (int) (uchar) 'e' || !wc)
681 break; /* Can't be a number part */
682 *b++= (char) wc;
683 }
684
685 res= my_strntoull10rnd_8bit(cs, buf, b - buf, unsign_fl, endptr, err);
686 *endptr= (char*) nptr + cs->mbminlen * (size_t) (*endptr - buf);
687 return res;
688 }
689
690
691 /*
692 This is a fast version optimized for the case of radix 10 / -10
693 */
694
695 static size_t
my_l10tostr_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t len,int radix,long int val)696 my_l10tostr_mb2_or_mb4(CHARSET_INFO *cs,
697 char *dst, size_t len, int radix, long int val)
698 {
699 char buffer[66];
700 register char *p, *db, *de;
701 long int new_val;
702 int sl= 0;
703 unsigned long int uval = (unsigned long int) val;
704
705 p= &buffer[sizeof(buffer) - 1];
706 *p= '\0';
707
708 if (radix < 0)
709 {
710 if (val < 0)
711 {
712 sl= 1;
713 /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
714 uval = (unsigned long int)0 - uval;
715 }
716 }
717
718 new_val = (long) (uval / 10);
719 *--p = '0'+ (char) (uval - (unsigned long) new_val * 10);
720 val= new_val;
721
722 while (val != 0)
723 {
724 new_val= val / 10;
725 *--p= '0' + (char) (val - new_val * 10);
726 val= new_val;
727 }
728
729 if (sl)
730 {
731 *--p= '-';
732 }
733
734 for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
735 {
736 int cnvres= cs->cset->wc_mb(cs,(my_wc_t)p[0],(uchar*) dst, (uchar*) de);
737 if (cnvres > 0)
738 dst+= cnvres;
739 else
740 break;
741 }
742 return (int) (dst - db);
743 }
744
745
746 static size_t
my_ll10tostr_mb2_or_mb4(CHARSET_INFO * cs,char * dst,size_t len,int radix,longlong val)747 my_ll10tostr_mb2_or_mb4(CHARSET_INFO *cs,
748 char *dst, size_t len, int radix, longlong val)
749 {
750 char buffer[65];
751 register char *p, *db, *de;
752 long long_val;
753 int sl= 0;
754 ulonglong uval= (ulonglong) val;
755
756 if (radix < 0)
757 {
758 if (val < 0)
759 {
760 sl= 1;
761 /* Avoid integer overflow in (-val) for LONGLONG_MIN (BUG#31799). */
762 uval = (ulonglong)0 - uval;
763 }
764 }
765
766 p= &buffer[sizeof(buffer)-1];
767 *p='\0';
768
769 if (uval == 0)
770 {
771 *--p= '0';
772 goto cnv;
773 }
774
775 while (uval > (ulonglong) LONG_MAX)
776 {
777 ulonglong quo= uval/(uint) 10;
778 uint rem= (uint) (uval- quo* (uint) 10);
779 *--p= '0' + rem;
780 uval= quo;
781 }
782
783 long_val= (long) uval;
784 while (long_val != 0)
785 {
786 long quo= long_val/10;
787 *--p= (char) ('0' + (long_val - quo*10));
788 long_val= quo;
789 }
790
791 cnv:
792 if (sl)
793 {
794 *--p= '-';
795 }
796
797 for ( db= dst, de= dst + len ; (dst < de) && *p ; p++)
798 {
799 int cnvres= cs->cset->wc_mb(cs, (my_wc_t) p[0], (uchar*) dst, (uchar*) de);
800 if (cnvres > 0)
801 dst+= cnvres;
802 else
803 break;
804 }
805 return (int) (dst -db);
806 }
807
808 #endif /* HAVE_CHARSET_mb2_or_mb4 */
809
810
811 #ifdef HAVE_CHARSET_mb2
812 /**
813 Convert a Unicode code point to a digit.
814 @param wc - the input Unicode code point
815 @param[OUT] c - the output character representing the digit value 0..9
816
817 @return 0 - if wc is a good digit
818 @return 1 - if wc is not a digit
819 */
820 static inline my_bool
wc2digit_uchar(uchar * c,my_wc_t wc)821 wc2digit_uchar(uchar *c, my_wc_t wc)
822 {
823 return wc > '9' || (c[0]= (uchar) (wc - '0')) > 9;
824 }
825
826
827 static longlong
my_strtoll10_mb2(CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)828 my_strtoll10_mb2(CHARSET_INFO *cs __attribute__((unused)),
829 const char *nptr, char **endptr, int *error)
830 {
831 const uchar *s, *end, *start, *n_end, *true_end;
832 uchar UNINIT_VAR(c);
833 unsigned long i, j, k;
834 ulonglong li;
835 int negative;
836 ulong cutoff, cutoff2, cutoff3;
837 my_wc_t wc;
838 int res;
839 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
840
841 s= (const uchar *) nptr;
842 /* If fixed length string */
843 if (endptr)
844 {
845 /*
846 Make sure string length is even.
847 Odd length indicates a bug in the caller.
848 Assert in debug, round in production.
849 */
850 DBUG_ASSERT((*endptr - (const char *) s) % 2 == 0);
851 end= s + ((*endptr - (const char*) s) / 2) * 2;
852
853 for ( ; ; ) /* Skip leading spaces and tabs */
854 {
855 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
856 goto no_conv;
857 s+= res;
858 if (wc != ' ' && wc != '\t')
859 break;
860 }
861 }
862 else
863 {
864 /* We don't support null terminated strings in UCS2 */
865 goto no_conv;
866 }
867
868 /* Check for a sign. */
869 negative= 0;
870 if (wc == '-')
871 {
872 *error= -1; /* Mark as negative number */
873 negative= 1;
874 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
875 goto no_conv;
876 s+= res; /* wc is now expected to hold the first digit. */
877 cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2;
878 cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
879 cutoff3= MAX_NEGATIVE_NUMBER % 100;
880 }
881 else
882 {
883 *error= 0;
884 if (wc == '+')
885 {
886 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
887 goto no_conv;
888 s+= res; /* wc is now expected to hold the first digit. */
889 }
890 cutoff= ULONGLONG_MAX / LFACTOR2;
891 cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
892 cutoff3= ULONGLONG_MAX % 100;
893 }
894
895 /*
896 The code below assumes that 'wc' holds the first digit
897 and 's' points to the next character after it.
898
899 Scan pre-zeros if any.
900 */
901 if (wc == '0')
902 {
903 i= 0;
904 for ( ; ; s+= res)
905 {
906 if (s == end)
907 goto end_i; /* Return 0 */
908 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
909 goto no_conv;
910 if (wc != '0')
911 break;
912 }
913 n_end= s + 2 * INIT_CNT;
914 }
915 else
916 {
917 /* Read first digit to check that it's a valid number */
918 if ((i= (wc - '0')) > 9)
919 goto no_conv;
920 n_end= s + 2 * (INIT_CNT-1);
921 }
922
923 /* Handle first 9 digits and store them in i */
924 if (n_end > end)
925 n_end= end;
926 for ( ; ; s+= res)
927 {
928 if ((res= mb_wc(cs, &wc, s, n_end)) <= 0)
929 break;
930 if (wc2digit_uchar(&c, wc))
931 goto end_i;
932 i= i*10+c;
933 }
934 if (s == end)
935 goto end_i;
936
937 /* Handle next 9 digits and store them in j */
938 j= 0;
939 start= s; /* Used to know how much to shift i */
940 n_end= true_end= s + 2 * INIT_CNT;
941 if (n_end > end)
942 n_end= end;
943 do
944 {
945 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
946 goto no_conv;
947 if (wc2digit_uchar(&c, wc))
948 goto end_i_and_j;
949 s+= res;
950 j= j * 10 + c;
951 } while (s != n_end);
952 if (s == end)
953 {
954 if (s != true_end)
955 goto end_i_and_j;
956 goto end3;
957 }
958
959 /* Handle the next 1 or 2 digits and store them in k */
960 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
961 goto no_conv;
962 if ((k= (wc - '0')) > 9)
963 goto end3;
964 s+= res;
965
966 if (s == end)
967 goto end4;
968 if ((res= mb_wc(cs, &wc, s, end)) <= 0)
969 goto no_conv;
970 if (wc2digit_uchar(&c, wc))
971 goto end4;
972 s+= res;
973 k= k*10+c;
974 *endptr= (char*) s;
975
976 /* number string should have ended here */
977 if (s != end && mb_wc(cs, &wc, s, end) > 0 && ((uchar) (wc - '0')) <= 9)
978 goto overflow;
979
980 /* Check that we didn't get an overflow with the last digit */
981 if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
982 k > cutoff3)))
983 goto overflow;
984 li=i*LFACTOR2+ (ulonglong) j*100 + k;
985 return (longlong) li;
986
987 overflow: /* *endptr is set here */
988 *error= MY_ERRNO_ERANGE;
989 return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
990
991 end_i:
992 *endptr= (char*) s;
993 return (negative ? ((longlong) -(long) i) : (longlong) i);
994
995 end_i_and_j:
996 li= (ulonglong) i * lfactor[(size_t) (s-start) / 2] + j;
997 *endptr= (char*) s;
998 return (negative ? -((longlong) li) : (longlong) li);
999
1000 end3:
1001 li=(ulonglong) i*LFACTOR+ (ulonglong) j;
1002 *endptr= (char*) s;
1003 return (negative ? -((longlong) li) : (longlong) li);
1004
1005 end4:
1006 li=(ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
1007 *endptr= (char*) s;
1008 if (negative)
1009 {
1010 if (li > MAX_NEGATIVE_NUMBER)
1011 goto overflow;
1012 return -((longlong) li);
1013 }
1014 return (longlong) li;
1015
1016 no_conv:
1017 /* There was no number to convert. */
1018 *error= MY_ERRNO_EDOM;
1019 *endptr= (char *) nptr;
1020 return 0;
1021 }
1022
1023
1024 static size_t
my_scan_mb2(CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)1025 my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)),
1026 const char *str, const char *end, int sequence_type)
1027 {
1028 const char *str0= str;
1029 my_wc_t wc;
1030 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1031 int res;
1032
1033 switch (sequence_type)
1034 {
1035 case MY_SEQ_SPACES:
1036 for (res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
1037 res > 0 && wc == ' ';
1038 str+= res,
1039 res= mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end))
1040 {
1041 }
1042 return (size_t) (str - str0);
1043 case MY_SEQ_NONSPACES:
1044 DBUG_ASSERT(0); /* Not implemented */
1045 /* pass through */
1046 default:
1047 return 0;
1048 }
1049 }
1050
1051
1052 static void
my_fill_mb2(CHARSET_INFO * cs,char * s,size_t slen,int fill)1053 my_fill_mb2(CHARSET_INFO *cs, char *s, size_t slen, int fill)
1054 {
1055 char buf[10], *last;
1056 size_t buflen, remainder;
1057
1058 DBUG_ASSERT((slen % 2) == 0);
1059
1060 buflen= cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
1061 (uchar*) buf + sizeof(buf));
1062
1063 DBUG_ASSERT(buflen > 0);
1064
1065 /*
1066 "last" in the last position where a sequence of "buflen" bytes can start.
1067 */
1068 for (last= s + slen - buflen; s <= last; s+= buflen)
1069 {
1070 /* Enough space for the character */
1071 memcpy(s, buf, buflen);
1072 }
1073
1074 /*
1075 If there are some more space which is not enough
1076 for the whole multibyte character, then add trailing zeros.
1077 */
1078 if ((remainder= last + buflen - s) > 0)
1079 bzero(s, (size_t) remainder);
1080 }
1081
1082
1083 static size_t
my_vsnprintf_mb2(char * dst,size_t n,const char * fmt,va_list ap)1084 my_vsnprintf_mb2(char *dst, size_t n, const char* fmt, va_list ap)
1085 {
1086 char *start=dst, *end= dst + n - 1;
1087 for (; *fmt ; fmt++)
1088 {
1089 if (fmt[0] != '%')
1090 {
1091 if (dst == end) /* End of buffer */
1092 break;
1093
1094 *dst++='\0';
1095 *dst++= *fmt; /* Copy ordinary char */
1096 continue;
1097 }
1098
1099 fmt++;
1100
1101 /* Skip if max size is used (to be compatible with printf) */
1102 while ( (*fmt >= '0' && *fmt <= '9') || *fmt == '.' || *fmt == '-')
1103 fmt++;
1104
1105 if (*fmt == 'l')
1106 fmt++;
1107
1108 if (*fmt == 's') /* String parameter */
1109 {
1110 char *par= va_arg(ap, char *);
1111 size_t plen;
1112 size_t left_len= (size_t)(end-dst);
1113 if (!par)
1114 par= (char*) "(null)";
1115 plen= strlen(par);
1116 if (left_len <= plen * 2)
1117 plen = left_len / 2 - 1;
1118
1119 for ( ; plen ; plen--, dst+=2, par++)
1120 {
1121 dst[0]= '\0';
1122 dst[1]= par[0];
1123 }
1124 continue;
1125 }
1126 else if (*fmt == 'd' || *fmt == 'u') /* Integer parameter */
1127 {
1128 int iarg;
1129 char nbuf[16];
1130 char *pbuf= nbuf;
1131
1132 if ((size_t) (end - dst) < 32)
1133 break;
1134 iarg= va_arg(ap, int);
1135 if (*fmt == 'd')
1136 int10_to_str((long) iarg, nbuf, -10);
1137 else
1138 int10_to_str((long) (uint) iarg, nbuf,10);
1139
1140 for (; pbuf[0]; pbuf++)
1141 {
1142 *dst++= '\0';
1143 *dst++= *pbuf;
1144 }
1145 continue;
1146 }
1147
1148 /* We come here on '%%', unknown code or too long parameter */
1149 if (dst == end)
1150 break;
1151 *dst++= '\0';
1152 *dst++= '%'; /* % used as % or unknown code */
1153 }
1154
1155 DBUG_ASSERT(dst <= end);
1156 *dst='\0'; /* End of errmessage */
1157 return (size_t) (dst - start);
1158 }
1159
1160
1161 static size_t
my_snprintf_mb2(CHARSET_INFO * cs,char * to,size_t n,const char * fmt,...)1162 my_snprintf_mb2(CHARSET_INFO *cs __attribute__((unused)),
1163 char* to, size_t n, const char* fmt, ...)
1164 {
1165 size_t ret;
1166 va_list args;
1167 va_start(args,fmt);
1168 ret= my_vsnprintf_mb2(to, n, fmt, args);
1169 va_end(args);
1170 return ret;
1171 }
1172
1173
1174 static size_t
my_lengthsp_mb2(CHARSET_INFO * cs,const char * ptr,size_t length)1175 my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
1176 const char *ptr, size_t length)
1177 {
1178 const char *end= ptr + length;
1179 while (end > ptr + 1 && end[-1] == ' ' && end[-2] == '\0')
1180 end-= 2;
1181 return (size_t) (end - ptr);
1182 }
1183
1184 #endif /* HAVE_CHARSET_mb2*/
1185
1186
1187 /*
1188 Next part is actually HAVE_CHARSET_utf16-specific,
1189 but the JSON functions needed my_utf16_uni()
1190 so the #ifdef was moved lower.
1191 */
1192 #include "ctype-utf16.h"
1193
1194 #define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0))
1195 #define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
1196
/*
  Weight of a two-byte UTF16 character for the general_ci collations:
  the "sort" value from the default unicase page of the character,
  or the code point itself when the page is absent.
*/
static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
{
  const my_wc_t code= MY_UTF16_WC2(b0, b1);
  MY_UNICASE_CHARACTER *page= my_unicase_default_pages[code >> 8];

  if (!page)
    return (int) code;
  return (int) page[code & 0xFF].sort;
}
1203 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci
1204 #define DEFINE_STRNXFRM_UNICODE
1205 #define DEFINE_STRNXFRM_UNICODE_NOPAD
1206 #define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf16_quick(pwc, s, e)
1207 #define OPTIMIZE_ASCII 0
1208 #define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
1209 #define UNICASE_PAGE0 my_unicase_default_page00
1210 #define UNICASE_PAGES my_unicase_default_pages
1211 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1212 #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1)
1213 #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
1214 #include "strcoll.inl"
1215
1216 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_bin
1217 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1218 #define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b0, b1))
1219 #define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3))
1220 #include "strcoll.inl"
1221
1222 #define DEFINE_STRNNCOLLSP_NOPAD
1223 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_nopad_ci
1224 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1225 #define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1)
1226 #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
1227 #include "strcoll.inl"
1228
1229 #define DEFINE_STRNNCOLLSP_NOPAD
1230 #define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_nopad_bin
1231 #define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
1232 #define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b0, b1))
1233 #define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3))
1234 #include "strcoll.inl"
1235
1236 #undef IS_MB2_CHAR
1237 #undef IS_MB4_CHAR
1238
1239 /*
1240 These two functions are used in JSON library, so made exportable
1241 and unconditionally compiled into the library.
1242 */
1243
1244 /*static*/ int
my_utf16_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)1245 my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
1246 my_wc_t *pwc, const uchar *s, const uchar *e)
1247 {
1248 return my_mb_wc_utf16_quick(pwc, s, e);
1249 }
1250
1251
1252 /*static*/ int
my_uni_utf16(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)1253 my_uni_utf16(CHARSET_INFO *cs __attribute__((unused)),
1254 my_wc_t wc, uchar *s, uchar *e)
1255 {
1256 if (wc <= 0xFFFF)
1257 {
1258 if (s + 2 > e)
1259 return MY_CS_TOOSMALL2;
1260 if (MY_UTF16_SURROGATE(wc))
1261 return MY_CS_ILUNI;
1262 *s++= (uchar) (wc >> 8);
1263 *s= (uchar) (wc & 0xFF);
1264 return 2;
1265 }
1266
1267 if (wc <= 0x10FFFF)
1268 {
1269 if (s + 4 > e)
1270 return MY_CS_TOOSMALL4;
1271 *s++= (uchar) ((wc-= 0x10000) >> 18) | 0xD8;
1272 *s++= (uchar) (wc >> 10) & 0xFF;
1273 *s++= (uchar) ((wc >> 8) & 3) | 0xDC;
1274 *s= (uchar) wc & 0xFF;
1275 return 4;
1276 }
1277
1278 return MY_CS_ILUNI;
1279 }
1280
1281
1282 #ifdef HAVE_CHARSET_utf16
1283
1284
1285 static inline void
my_tolower_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1286 my_tolower_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1287 {
1288 MY_UNICASE_CHARACTER *page;
1289 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1290 *wc= page[*wc & 0xFF].tolower;
1291 }
1292
1293
1294 static inline void
my_toupper_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1295 my_toupper_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1296 {
1297 MY_UNICASE_CHARACTER *page;
1298 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
1299 *wc= page[*wc & 0xFF].toupper;
1300 }
1301
1302
1303 static inline void
my_tosort_utf16(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)1304 my_tosort_utf16(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
1305 {
1306 if (*wc <= uni_plane->maxchar)
1307 {
1308 MY_UNICASE_CHARACTER *page;
1309 if ((page= uni_plane->page[*wc >> 8]))
1310 *wc= page[*wc & 0xFF].sort;
1311 }
1312 else
1313 {
1314 *wc= MY_CS_REPLACEMENT_CHARACTER;
1315 }
1316 }
1317
1318
1319
1320 static size_t
my_caseup_utf16(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)1321 my_caseup_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
1322 char *dst, size_t dstlen)
1323 {
1324 my_wc_t wc;
1325 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1326 my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
1327 int res;
1328 const char *srcend= src + srclen;
1329 char *dstend= dst + dstlen;
1330 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1331 DBUG_ASSERT(srclen <= dstlen);
1332
1333 while ((src < srcend) &&
1334 (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1335 {
1336 my_toupper_utf16(uni_plane, &wc);
1337 if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
1338 break;
1339 src+= res;
1340 dst+= res;
1341 }
1342 return srclen;
1343 }
1344
1345
1346 static void
my_hash_sort_utf16_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)1347 my_hash_sort_utf16_nopad(CHARSET_INFO *cs,
1348 const uchar *s, size_t slen,
1349 ulong *nr1, ulong *nr2)
1350 {
1351 my_wc_t wc;
1352 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1353 int res;
1354 const uchar *e= s + slen;
1355 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1356 register ulong m1= *nr1, m2= *nr2;
1357
1358 while ((s < e) && (res= mb_wc(cs, &wc, (uchar *) s, (uchar *) e)) > 0)
1359 {
1360 my_tosort_utf16(uni_plane, &wc);
1361 MY_HASH_ADD_16(m1, m2, wc);
1362 s+= res;
1363 }
1364 *nr1= m1;
1365 *nr2= m2;
1366 }
1367
1368
1369 static void
my_hash_sort_utf16(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)1370 my_hash_sort_utf16(CHARSET_INFO *cs, const uchar *s, size_t slen,
1371 ulong *nr1, ulong *nr2)
1372 {
1373 size_t lengthsp= cs->cset->lengthsp(cs, (const char *) s, slen);
1374 my_hash_sort_utf16_nopad(cs, s, lengthsp, nr1, nr2);
1375 }
1376
1377
1378 static size_t
my_casedn_utf16(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)1379 my_casedn_utf16(CHARSET_INFO *cs, const char *src, size_t srclen,
1380 char *dst, size_t dstlen)
1381 {
1382 my_wc_t wc;
1383 my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
1384 my_charset_conv_wc_mb wc_mb= cs->cset->wc_mb;
1385 int res;
1386 const char *srcend= src + srclen;
1387 char *dstend= dst + dstlen;
1388 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1389 DBUG_ASSERT(srclen <= dstlen);
1390
1391 while ((src < srcend) &&
1392 (res= mb_wc(cs, &wc, (uchar *) src, (uchar *) srcend)) > 0)
1393 {
1394 my_tolower_utf16(uni_plane, &wc);
1395 if (res != wc_mb(cs, wc, (uchar *) dst, (uchar *) dstend))
1396 break;
1397 src+= res;
1398 dst+= res;
1399 }
1400 return srclen;
1401 }
1402
1403
1404 static int
my_charlen_utf16(CHARSET_INFO * cs,const uchar * str,const uchar * end)1405 my_charlen_utf16(CHARSET_INFO *cs, const uchar *str, const uchar *end)
1406 {
1407 my_wc_t wc;
1408 return cs->cset->mb_wc(cs, &wc, str, end);
1409 }
1410
1411
/*
  Instantiate ctype-mb.inl for utf16: generated names get the "_utf16"
  suffix and character lengths come from my_charlen_utf16().
  Unlike the strcoll.inl instantiations, the macros are #undef'ed here
  explicitly after the include.
*/
#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf16
#define CHARLEN(cs,str,end)       my_charlen_utf16(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
#include "ctype-mb.inl"
#undef MY_FUNCTION_NAME
#undef CHARLEN
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* Defines my_well_formed_char_length_utf16 */
1420
1421
1422 static size_t
my_numchars_utf16(CHARSET_INFO * cs,const char * b,const char * e)1423 my_numchars_utf16(CHARSET_INFO *cs,
1424 const char *b, const char *e)
1425 {
1426 size_t nchars= 0;
1427 for ( ; ; nchars++)
1428 {
1429 size_t charlen= my_ismbchar(cs, b, e);
1430 if (!charlen)
1431 break;
1432 b+= charlen;
1433 }
1434 return nchars;
1435 }
1436
1437
1438 static size_t
my_charpos_utf16(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)1439 my_charpos_utf16(CHARSET_INFO *cs,
1440 const char *b, const char *e, size_t pos)
1441 {
1442 const char *b0= b;
1443 uint charlen;
1444
1445 for ( ; pos; b+= charlen, pos--)
1446 {
1447 if (!(charlen= my_ismbchar(cs, b, e)))
1448 return (e + 2 - b0); /* Error, return pos outside the string */
1449 }
1450 return (size_t) (pos ? (e + 2 - b0) : (b - b0));
1451 }
1452
1453
1454 static int
my_wildcmp_utf16_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1455 my_wildcmp_utf16_ci(CHARSET_INFO *cs,
1456 const char *str,const char *str_end,
1457 const char *wildstr,const char *wildend,
1458 int escape, int w_one, int w_many)
1459 {
1460 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
1461 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1462 escape, w_one, w_many, uni_plane);
1463 }
1464
1465
1466 static int
my_wildcmp_utf16_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)1467 my_wildcmp_utf16_bin(CHARSET_INFO *cs,
1468 const char *str,const char *str_end,
1469 const char *wildstr,const char *wildend,
1470 int escape, int w_one, int w_many)
1471 {
1472 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
1473 escape, w_one, w_many, NULL);
1474 }
1475
1476
1477 static void
my_hash_sort_utf16_nopad_bin(CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1478 my_hash_sort_utf16_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
1479 const uchar *pos, size_t len,
1480 ulong *nr1, ulong *nr2)
1481 {
1482 const uchar *end= pos + len;
1483 register ulong m1= *nr1, m2= *nr2;
1484
1485 for ( ; pos < end ; pos++)
1486 {
1487 MY_HASH_ADD(m1, m2, (uint)*pos);
1488 }
1489 *nr1= m1;
1490 *nr2= m2;
1491 }
1492
1493
1494 static void
my_hash_sort_utf16_bin(CHARSET_INFO * cs,const uchar * pos,size_t len,ulong * nr1,ulong * nr2)1495 my_hash_sort_utf16_bin(CHARSET_INFO *cs,
1496 const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
1497 {
1498 size_t lengthsp= cs->cset->lengthsp(cs, (const char *) pos, len);
1499 my_hash_sort_utf16_nopad_bin(cs, pos, lengthsp, nr1, nr2);
1500 }
1501
1502
/*
  Collation handler table for utf16_general_ci.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here -- check it before
  adding or reordering entries).
*/
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_general_ci,
  my_strnncollsp_utf16_general_ci,
  my_strnncollsp_nchars_utf16_general_ci,
  my_strnxfrm_utf16_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16,
  my_propagate_simple
};
1518
1519
/*
  Collation handler table for utf16_bin.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here -- check it before
  adding or reordering entries).
*/
static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_bin,
  my_strnncollsp_utf16_bin,
  my_strnncollsp_nchars_utf16_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_bin,
  my_propagate_simple
};
1535
1536
/*
  Collation handler table for utf16_general_nopad_ci.
  It reuses my_strnncoll_utf16_general_ci and my_wildcmp_utf16_ci from the
  padded collation, and plugs in the nopad variants for strnncollsp,
  strnncollsp_nchars, strnxfrm and hash_sort.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here).
*/
static MY_COLLATION_HANDLER my_collation_utf16_general_nopad_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_general_ci,
  my_strnncollsp_utf16_general_nopad_ci,
  my_strnncollsp_nchars_utf16_general_nopad_ci,
  my_strnxfrm_nopad_utf16_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad,
  my_propagate_simple
};
1552
1553
/*
  Collation handler table for utf16_nopad_bin.
  It reuses my_strnncoll_utf16_bin and my_wildcmp_utf16_bin from the padded
  collation, and plugs in the nopad variants for strnncollsp,
  strnncollsp_nchars, strnxfrm and hash_sort.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here).
*/
static MY_COLLATION_HANDLER my_collation_utf16_nopad_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16_bin,
  my_strnncollsp_utf16_nopad_bin,
  my_strnncollsp_nchars_utf16_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad_bin,
  my_propagate_simple
};
1569
1570
/*
  Character set handler table shared by all four utf16 collations defined
  below. Not static, so it is visible outside this file.
  The initializer is positional; the slot order is fixed by the
  MY_CHARSET_HANDLER declaration (not visible here). Note that
  my_uni_utf16 appears twice: in the wc_mb slot and in the last slot.
*/
MY_CHARSET_HANDLER my_charset_utf16_handler=
{
  NULL,                /* init */
  my_numchars_utf16,
  my_charpos_utf16,
  my_lengthsp_mb2,
  my_numcells_mb,
  my_utf16_uni,        /* mb_wc */
  my_uni_utf16,        /* wc_mb */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf16,
  my_casedn_utf16,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_mb2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2,
  my_charlen_utf16,
  my_well_formed_char_length_utf16,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf16,
};
1602
1603
/*
  Descriptor of collation "utf16_general_ci" (number 54) of character set
  "utf16". It is the only utf16 collation in this file flagged
  MY_CS_PRIMARY. Character operations go through my_charset_utf16_handler,
  comparisons through my_collation_utf16_general_ci_handler.
*/
struct charset_info_st my_charset_utf16_general_ci=
{
  54,0,0,              /* number */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16",             /* cs name */
  "utf16_general_ci",  /* name */
  "UTF-16 Unicode",    /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16_handler,
  &my_collation_utf16_general_ci_handler
};
1635
1636
/*
  Descriptor of collation "utf16_bin" (number 55) of character set "utf16",
  flagged MY_CS_BINSORT. Character operations go through
  my_charset_utf16_handler, comparisons through
  my_collation_utf16_bin_handler.
*/
struct charset_info_st my_charset_utf16_bin=
{
  55,0,0,              /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16",             /* cs name */
  "utf16_bin",         /* name */
  "UTF-16 Unicode",    /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16_handler,
  &my_collation_utf16_bin_handler
};
1668
1669
/*
  Descriptor of collation "utf16_general_nopad_ci" of character set
  "utf16". Its number is derived from the padded collation's number with
  MY_NOPAD_ID(54), and MY_CS_NOPAD is set in the flags. Comparisons go
  through my_collation_utf16_general_nopad_ci_handler.
*/
struct charset_info_st my_charset_utf16_general_nopad_ci=
{
  MY_NOPAD_ID(54),0,0, /* number */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  "utf16",             /* cs name */
  "utf16_general_nopad_ci", /* name */
  "UTF-16 Unicode",    /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16_handler,
  &my_collation_utf16_general_nopad_ci_handler
};
1701
1702
/*
  Descriptor of collation "utf16_nopad_bin" of character set "utf16".
  Its number is derived from the padded collation's number with
  MY_NOPAD_ID(55); both MY_CS_BINSORT and MY_CS_NOPAD are set in the flags.
  Comparisons go through my_collation_utf16_nopad_bin_handler.
*/
struct charset_info_st my_charset_utf16_nopad_bin=
{
  MY_NOPAD_ID(55),0,0, /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  "utf16",             /* cs name */
  "utf16_nopad_bin",   /* name */
  "UTF-16 Unicode",    /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16_handler,
  &my_collation_utf16_nopad_bin_handler
};
1735
1736
/*
  Character classifiers for utf16le. The surrogate tests look at b1 and b3,
  i.e. the second byte of each 16-bit unit.
*/
#define IS_MB2_CHAR(b0,b1)       (!MY_UTF16_SURROGATE_HEAD(b1))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3))

/*
  Instantiate strcoll.inl for utf16le_general_ci, including the strnxfrm
  variants (DEFINE_STRNXFRM_UNICODE, DEFINE_STRNXFRM_UNICODE_NOPAD).
  Characters are decoded through the charset's mb_wc handler.
  WEIGHT_MB2 calls my_weight_mb2_utf16mb2_general_ci() with the two bytes
  swapped (b1,b0); every four-byte character weighs
  MY_CS_REPLACEMENT_CHARACTER.
  NOTE(review): the macros are re-#defined for each instantiation below with
  no #undef in between, so strcoll.inl presumably #undefs them itself --
  confirm.
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  (cs->cset->mb_wc(cs, pwc, s, e))
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b1,b0)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/*
  Instantiate strcoll.inl for utf16le_bin. The bytes of each 16-bit unit
  are swapped before being passed to MY_UTF16_WC2()/MY_UTF16_WC4().
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b1, b0))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b1, b0, b3, b2))
#include "strcoll.inl"

/* utf16le_general_nopad_ci: same weights as utf16le_general_ci above. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_utf16mb2_general_ci(b1,b0)
#define WEIGHT_MB4(b0,b1,b2,b3)  MY_CS_REPLACEMENT_CHARACTER
#include "strcoll.inl"

/* utf16le_nopad_bin: same weights as utf16le_bin above. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf16le_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        ((int) MY_UTF16_WC2(b1, b0))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF16_WC4(b1, b0, b3, b2))
#include "strcoll.inl"

#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
1775
1776 static int
my_utf16le_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)1777 my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)),
1778 my_wc_t *pwc, const uchar *s, const uchar *e)
1779 {
1780 my_wc_t lo;
1781
1782 if (s + 2 > e)
1783 return MY_CS_TOOSMALL2;
1784
1785 if ((*pwc= uint2korr(s)) < MY_UTF16_SURROGATE_HIGH_FIRST ||
1786 (*pwc > MY_UTF16_SURROGATE_LOW_LAST))
1787 return 2; /* [0000-D7FF,E000-FFFF] */
1788
1789 if (*pwc >= MY_UTF16_SURROGATE_LOW_FIRST)
1790 return MY_CS_ILSEQ; /* [DC00-DFFF] Low surrogate part without high part */
1791
1792 if (s + 4 > e)
1793 return MY_CS_TOOSMALL4;
1794
1795 s+= 2;
1796
1797 if ((lo= uint2korr(s)) < MY_UTF16_SURROGATE_LOW_FIRST ||
1798 lo > MY_UTF16_SURROGATE_LOW_LAST)
1799 return MY_CS_ILSEQ; /* Expected low surrogate part, got something else */
1800
1801 *pwc= 0x10000 + (((*pwc & 0x3FF) << 10) | (lo & 0x3FF));
1802 return 4;
1803 }
1804
1805
1806 static int
my_uni_utf16le(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)1807 my_uni_utf16le(CHARSET_INFO *cs __attribute__((unused)),
1808 my_wc_t wc, uchar *s, uchar *e)
1809 {
1810 uint32 first, second, total;
1811 if (wc < MY_UTF16_SURROGATE_HIGH_FIRST ||
1812 (wc > MY_UTF16_SURROGATE_LOW_LAST &&
1813 wc <= 0xFFFF))
1814 {
1815 if (s + 2 > e)
1816 return MY_CS_TOOSMALL2;
1817 int2store(s, wc);
1818 return 2; /* [0000-D7FF,E000-FFFF] */
1819 }
1820
1821 if (wc < 0xFFFF || wc > 0x10FFFF)
1822 return MY_CS_ILUNI; /* [D800-DFFF,10FFFF+] */
1823
1824 if (s + 4 > e)
1825 return MY_CS_TOOSMALL4;
1826
1827 wc-= 0x10000;
1828 first= (0xD800 | ((wc >> 10) & 0x3FF));
1829 second= (0xDC00 | (wc & 0x3FF));
1830 total= first | (second << 16);
1831 int4store(s, total);
1832 return 4; /* [010000-10FFFF] */
1833 }
1834
1835
1836 static size_t
my_lengthsp_utf16le(CHARSET_INFO * cs,const char * ptr,size_t length)1837 my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)),
1838 const char *ptr, size_t length)
1839 {
1840 const char *end= ptr + length;
1841 while (end > ptr + 1 && uint2korr(end - 2) == ' ')
1842 end-= 2;
1843 return (size_t) (end - ptr);
1844 }
1845
1846
/*
  Collation handler table for utf16le_general_ci.
  The comparison and strnxfrm entries are the utf16le instantiations; the
  wildcmp and hash_sort entries are shared with utf16.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_general_ci,
  my_strnncollsp_utf16le_general_ci,
  my_strnncollsp_nchars_utf16le_general_ci,
  my_strnxfrm_utf16le_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16,
  my_propagate_simple
};
1862
1863
/*
  Collation handler table for utf16le_bin.
  The comparison entries are the utf16le instantiations; the wildcmp and
  hash_sort entries are shared with utf16.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_bin,
  my_strnncollsp_utf16le_bin,
  my_strnncollsp_nchars_utf16le_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_bin,
  my_propagate_simple
};
1879
1880
/*
  Collation handler table for utf16le_general_nopad_ci.
  It reuses my_strnncoll_utf16le_general_ci from the padded collation and
  plugs in the nopad variants for strnncollsp, strnncollsp_nchars, strnxfrm
  and hash_sort.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_general_nopad_ci_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_general_ci,
  my_strnncollsp_utf16le_general_nopad_ci,
  my_strnncollsp_nchars_utf16le_general_nopad_ci,
  my_strnxfrm_nopad_utf16le_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf16_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad,
  my_propagate_simple
};
1896
1897
/*
  Collation handler table for utf16le_nopad_bin.
  It reuses my_strnncoll_utf16le_bin from the padded collation and plugs in
  the nopad variants for strnncollsp, strnncollsp_nchars, strnxfrm and
  hash_sort.
  The initializer is positional; the slot order is fixed by the
  MY_COLLATION_HANDLER declaration (not visible here).
*/
static MY_COLLATION_HANDLER my_collation_utf16le_nopad_bin_handler =
{
  NULL,                /* init */
  my_strnncoll_utf16le_bin,
  my_strnncollsp_utf16le_nopad_bin,
  my_strnncollsp_nchars_utf16le_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf16_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf16_nopad_bin,
  my_propagate_simple
};
1913
1914
/*
  Character set handler table shared by all four utf16le collations defined
  below. It differs from my_charset_utf16_handler only in the lengthsp,
  mb_wc and wc_mb entries (and the last slot, which repeats wc_mb); the
  remaining routines are shared with utf16.
  The initializer is positional; the slot order is fixed by the
  MY_CHARSET_HANDLER declaration (not visible here).
*/
static MY_CHARSET_HANDLER my_charset_utf16le_handler=
{
  NULL,                /* init */
  my_numchars_utf16,
  my_charpos_utf16,
  my_lengthsp_utf16le,
  my_numcells_mb,
  my_utf16le_uni,      /* mb_wc */
  my_uni_utf16le,      /* wc_mb */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf16,
  my_casedn_utf16,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_mb2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2,
  my_charlen_utf16,
  my_well_formed_char_length_utf16,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf16le,
};
1946
1947
/*
  Descriptor of collation "utf16le_general_ci" (number 56) of character set
  "utf16le". It is the only utf16le collation in this file flagged
  MY_CS_PRIMARY. Character operations go through
  my_charset_utf16le_handler, comparisons through
  my_collation_utf16le_general_ci_handler.
*/
struct charset_info_st my_charset_utf16le_general_ci=
{
  56,0,0,              /* number */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16le",           /* cs name */
  "utf16le_general_ci",/* name */
  "UTF-16LE Unicode",  /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16le_handler,
  &my_collation_utf16le_general_ci_handler
};
1979
1980
/*
  Descriptor of collation "utf16le_bin" (number 62) of character set
  "utf16le", flagged MY_CS_BINSORT. Character operations go through
  my_charset_utf16le_handler, comparisons through
  my_collation_utf16le_bin_handler.
*/
struct charset_info_st my_charset_utf16le_bin=
{
  62,0,0,              /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf16le",           /* cs name */
  "utf16le_bin",       /* name */
  "UTF-16LE Unicode",  /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16le_handler,
  &my_collation_utf16le_bin_handler
};
2012
2013
/*
  Descriptor of collation "utf16le_general_nopad_ci" of character set
  "utf16le". Its number is derived from the padded collation's number with
  MY_NOPAD_ID(56), and MY_CS_NOPAD is set in the flags. Comparisons go
  through my_collation_utf16le_general_nopad_ci_handler.
*/
struct charset_info_st my_charset_utf16le_general_nopad_ci=
{
  MY_NOPAD_ID(56),0,0, /* number */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  "utf16le",           /* cs name */
  "utf16le_general_nopad_ci",/* name */
  "UTF-16LE Unicode",  /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16le_handler,
  &my_collation_utf16le_general_nopad_ci_handler
};
2045
2046
/*
  Descriptor of collation "utf16le_nopad_bin" of character set "utf16le".
  Its number is derived from the padded collation's number with
  MY_NOPAD_ID(62); both MY_CS_BINSORT and MY_CS_NOPAD are set in the flags.
  Comparisons go through my_collation_utf16le_nopad_bin_handler.
*/
struct charset_info_st my_charset_utf16le_nopad_bin=
{
  MY_NOPAD_ID(62),0,0, /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  "utf16le",           /* cs name */
  "utf16le_nopad_bin", /* name */
  "UTF-16LE Unicode",  /* comment */
  NULL,                /* tailoring */
  NULL,                /* ctype */
  NULL,                /* to_lower */
  NULL,                /* to_upper */
  NULL,                /* sort_order */
  NULL,                /* uca */
  NULL,                /* tab_to_uni */
  NULL,                /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL,                /* state_map */
  NULL,                /* ident_map */
  1,                   /* strxfrm_multiply */
  1,                   /* caseup_multiply */
  1,                   /* casedn_multiply */
  2,                   /* mbminlen */
  4,                   /* mbmaxlen */
  0,                   /* min_sort_char */
  0xFFFF,              /* max_sort_char */
  ' ',                 /* pad char */
  0,                   /* escape_with_backslash_is_dangerous */
  1,                   /* levels_for_order */
  &my_charset_utf16le_handler,
  &my_collation_utf16le_nopad_bin_handler
};
2079
2080
2081 #endif /* HAVE_CHARSET_utf16 */
2082
2083
2084 #ifdef HAVE_CHARSET_utf32
2085
2086 #include "ctype-utf32.h"
2087
/*
  Check if b0 and b1 start a valid UTF32 four-byte sequence.
  Don't accept characters greater than U+10FFFF: the first byte must be
  zero and the second byte must not exceed 0x10.
*/
#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))

/* Only the first two bytes are examined; b2 and b3 are ignored. */
#define IS_MB4_CHAR(b0,b1,b2,b3)   (IS_UTF32_MBHEAD4(b0,b1))
2095
2096
/*
  Sort weight of one utf32 character for the general_ci collations.

  The four bytes are combined with MY_UTF32_WC4(). For values up to 0xFFFF
  the weight is the .sort entry from my_unicase_default_pages, or the value
  itself when its page is absent. Anything above 0xFFFF weighs
  MY_CS_REPLACEMENT_CHARACTER.
*/
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
                                             uchar b2, uchar b3)
{
  my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3);
  MY_UNICASE_CHARACTER *page;

  if (wc > 0xFFFF)
    return MY_CS_REPLACEMENT_CHARACTER;

  page= my_unicase_default_pages[wc >> 8];
  if (!page)
    return (int) wc;
  return (int) page[wc & 0xFF].sort;
}
/*
  Instantiate strcoll.inl for utf32_general_ci, including the strnxfrm
  variants (DEFINE_STRNXFRM_UNICODE, DEFINE_STRNXFRM_UNICODE_NOPAD).
  Only the four-byte classifier and weight macros are defined in this
  section; weights come from my_weight_utf32_general_ci().
  NOTE(review): the macros are re-#defined for each instantiation below with
  no #undef in between, so strcoll.inl presumably #undefs them itself --
  confirm.
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_utf32_quick(pwc, s, e)
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.inl"

/* utf32_bin: the weight is the value produced by MY_UTF32_WC4(). */
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* utf32_general_nopad_ci: same weights as utf32_general_ci above. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.inl"

/* utf32_nopad_bin: same weights as utf32_bin above. */
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _utf32_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3)  ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.inl"

/* IS_MB2_CHAR is not defined in this section; its #undef is a no-op. */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
2139
2140
2141 static int
my_utf32_uni(CHARSET_INFO * cs,my_wc_t * pwc,const uchar * s,const uchar * e)2142 my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
2143 my_wc_t *pwc, const uchar *s, const uchar *e)
2144 {
2145 return my_mb_wc_utf32_quick(pwc, s, e);
2146 }
2147
2148
2149 static int
my_uni_utf32(CHARSET_INFO * cs,my_wc_t wc,uchar * s,uchar * e)2150 my_uni_utf32(CHARSET_INFO *cs __attribute__((unused)),
2151 my_wc_t wc, uchar *s, uchar *e)
2152 {
2153 if (s + 4 > e)
2154 return MY_CS_TOOSMALL4;
2155
2156 if (wc > 0x10FFFF)
2157 return MY_CS_ILUNI;
2158
2159 s[0]= (uchar) (wc >> 24);
2160 s[1]= (uchar) (wc >> 16) & 0xFF;
2161 s[2]= (uchar) (wc >> 8) & 0xFF;
2162 s[3]= (uchar) wc & 0xFF;
2163 return 4;
2164 }
2165
2166
2167 static inline void
my_tolower_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2168 my_tolower_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2169 {
2170 MY_UNICASE_CHARACTER *page;
2171 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
2172 *wc= page[*wc & 0xFF].tolower;
2173 }
2174
2175
2176 static inline void
my_toupper_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2177 my_toupper_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2178 {
2179 MY_UNICASE_CHARACTER *page;
2180 if ((*wc <= uni_plane->maxchar) && (page= uni_plane->page[*wc >> 8]))
2181 *wc= page[*wc & 0xFF].toupper;
2182 }
2183
2184
2185 static inline void
my_tosort_utf32(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)2186 my_tosort_utf32(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
2187 {
2188 if (*wc <= uni_plane->maxchar)
2189 {
2190 MY_UNICASE_CHARACTER *page;
2191 if ((page= uni_plane->page[*wc >> 8]))
2192 *wc= page[*wc & 0xFF].sort;
2193 }
2194 else
2195 {
2196 *wc= MY_CS_REPLACEMENT_CHARACTER;
2197 }
2198 }
2199
2200
2201 static size_t
my_lengthsp_utf32(CHARSET_INFO * cs,const char * ptr,size_t length)2202 my_lengthsp_utf32(CHARSET_INFO *cs __attribute__((unused)),
2203 const char *ptr, size_t length)
2204 {
2205 const char *end= ptr + length;
2206 DBUG_ASSERT((length % 4) == 0);
2207 while (end > ptr + 3 && end[-1] == ' ' && !end[-2] && !end[-3] && !end[-4])
2208 end-= 4;
2209 return (size_t) (end - ptr);
2210 }
2211
2212
2213 static size_t
my_caseup_utf32(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)2214 my_caseup_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
2215 char *dst, size_t dstlen)
2216 {
2217 my_wc_t wc;
2218 int res;
2219 const char *srcend= src + srclen;
2220 char *dstend= dst + dstlen;
2221 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2222 DBUG_ASSERT(srclen <= dstlen);
2223
2224 while ((src < srcend) &&
2225 (res= my_utf32_uni(cs, &wc, (uchar *)src, (uchar*) srcend)) > 0)
2226 {
2227 my_toupper_utf32(uni_plane, &wc);
2228 if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
2229 break;
2230 src+= res;
2231 dst+= res;
2232 }
2233 return srclen;
2234 }
2235
2236
2237 static void
my_hash_sort_utf32_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)2238 my_hash_sort_utf32_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
2239 ulong *nr1, ulong *nr2)
2240 {
2241 my_wc_t wc;
2242 int res;
2243 const uchar *e= s + slen;
2244 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2245 register ulong m1= *nr1, m2= *nr2;
2246
2247 while ((res= my_utf32_uni(cs, &wc, (uchar*) s, (uchar*) e)) > 0)
2248 {
2249 my_tosort_utf32(uni_plane, &wc);
2250 MY_HASH_ADD(m1, m2, (uint) (wc >> 24));
2251 MY_HASH_ADD(m1, m2, (uint) (wc >> 16) & 0xFF);
2252 MY_HASH_ADD(m1, m2, (uint) (wc >> 8) & 0xFF);
2253 MY_HASH_ADD(m1, m2, (uint) (wc & 0xFF));
2254 s+= res;
2255 }
2256 *nr1= m1;
2257 *nr2= m2;
2258 }
2259
2260
2261 static void
my_hash_sort_utf32(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)2262 my_hash_sort_utf32(CHARSET_INFO *cs, const uchar *s, size_t slen,
2263 ulong *nr1, ulong *nr2)
2264 {
2265 size_t lengthsp= my_lengthsp_utf32(cs, (const char *) s, slen);
2266 my_hash_sort_utf32_nopad(cs, s, lengthsp, nr1, nr2);
2267 }
2268
2269
2270 static size_t
my_casedn_utf32(CHARSET_INFO * cs,const char * src,size_t srclen,char * dst,size_t dstlen)2271 my_casedn_utf32(CHARSET_INFO *cs, const char *src, size_t srclen,
2272 char *dst, size_t dstlen)
2273 {
2274 my_wc_t wc;
2275 int res;
2276 const char *srcend= src + srclen;
2277 char *dstend= dst + dstlen;
2278 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2279 DBUG_ASSERT(srclen <= dstlen);
2280
2281 while ((res= my_utf32_uni(cs, &wc, (uchar*) src, (uchar*) srcend)) > 0)
2282 {
2283 my_tolower_utf32(uni_plane,&wc);
2284 if (res != my_uni_utf32(cs, wc, (uchar*) dst, (uchar*) dstend))
2285 break;
2286 src+= res;
2287 dst+= res;
2288 }
2289 return srclen;
2290 }
2291
2292
2293 static int
my_charlen_utf32(CHARSET_INFO * cs,const uchar * b,const uchar * e)2294 my_charlen_utf32(CHARSET_INFO *cs __attribute__((unused)),
2295 const uchar *b, const uchar *e)
2296 {
2297 return b + 4 > e ? MY_CS_TOOSMALL4 :
2298 IS_UTF32_MBHEAD4(b[0], b[1]) ? 4 : MY_CS_ILSEQ;
2299 }
2300
2301
/*
  Instantiate ctype-mb.inl for utf32: generated names get the "_utf32"
  suffix and character lengths come from my_charlen_utf32().
  The macros are #undef'ed explicitly after the include.
*/
#define MY_FUNCTION_NAME(x)       my_ ## x ## _utf32
#define CHARLEN(cs,str,end)       my_charlen_utf32(cs,str,end)
#define DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
#include "ctype-mb.inl"
#undef MY_FUNCTION_NAME
#undef CHARLEN
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* Defines my_well_formed_char_length_utf32 */
2310
2311
2312 static size_t
my_vsnprintf_utf32(char * dst,size_t n,const char * fmt,va_list ap)2313 my_vsnprintf_utf32(char *dst, size_t n, const char* fmt, va_list ap)
2314 {
2315 char *start= dst, *end= dst + n;
2316 DBUG_ASSERT((n % 4) == 0);
2317 for (; *fmt ; fmt++)
2318 {
2319 if (fmt[0] != '%')
2320 {
2321 if (dst >= end) /* End of buffer */
2322 break;
2323
2324 *dst++= '\0';
2325 *dst++= '\0';
2326 *dst++= '\0';
2327 *dst++= *fmt; /* Copy ordinary char */
2328 continue;
2329 }
2330
2331 fmt++;
2332
2333 /* Skip if max size is used (to be compatible with printf) */
2334 while ( (*fmt>='0' && *fmt<='9') || *fmt == '.' || *fmt == '-')
2335 fmt++;
2336
2337 if (*fmt == 'l')
2338 fmt++;
2339
2340 if (*fmt == 's') /* String parameter */
2341 {
2342 reg2 char *par= va_arg(ap, char *);
2343 size_t plen;
2344 size_t left_len= (size_t)(end - dst);
2345 if (!par) par= (char*)"(null)";
2346 plen= strlen(par);
2347 if (left_len <= plen*4)
2348 plen= left_len / 4 - 1;
2349
2350 for ( ; plen ; plen--, dst+= 4, par++)
2351 {
2352 dst[0]= '\0';
2353 dst[1]= '\0';
2354 dst[2]= '\0';
2355 dst[3]= par[0];
2356 }
2357 continue;
2358 }
2359 else if (*fmt == 'd' || *fmt == 'u') /* Integer parameter */
2360 {
2361 register int iarg;
2362 char nbuf[16];
2363 char *pbuf= nbuf;
2364
2365 if ((size_t) (end - dst) < 64)
2366 break;
2367 iarg= va_arg(ap, int);
2368 if (*fmt == 'd')
2369 int10_to_str((long) iarg, nbuf, -10);
2370 else
2371 int10_to_str((long) (uint) iarg,nbuf,10);
2372
2373 for (; pbuf[0]; pbuf++)
2374 {
2375 *dst++= '\0';
2376 *dst++= '\0';
2377 *dst++= '\0';
2378 *dst++= *pbuf;
2379 }
2380 continue;
2381 }
2382
2383 /* We come here on '%%', unknown code or too long parameter */
2384 if (dst == end)
2385 break;
2386 *dst++= '\0';
2387 *dst++= '\0';
2388 *dst++= '\0';
2389 *dst++= '%'; /* % used as % or unknown code */
2390 }
2391
2392 DBUG_ASSERT(dst < end);
2393 *dst++= '\0';
2394 *dst++= '\0';
2395 *dst++= '\0';
2396 *dst++= '\0'; /* End of errmessage */
2397 return (size_t) (dst - start - 4);
2398 }
2399
2400
2401 static size_t
my_snprintf_utf32(CHARSET_INFO * cs,char * to,size_t n,const char * fmt,...)2402 my_snprintf_utf32(CHARSET_INFO *cs __attribute__((unused)),
2403 char* to, size_t n, const char* fmt, ...)
2404 {
2405 size_t ret;
2406 va_list args;
2407 va_start(args,fmt);
2408 ret= my_vsnprintf_utf32(to, n, fmt, args);
2409 va_end(args);
2410 return ret;
2411 }
2412
2413
2414 static longlong
my_strtoll10_utf32(CHARSET_INFO * cs,const char * nptr,char ** endptr,int * error)2415 my_strtoll10_utf32(CHARSET_INFO *cs __attribute__((unused)),
2416 const char *nptr, char **endptr, int *error)
2417 {
2418 const char *s, *end, *start, *n_end, *true_end;
2419 uchar c;
2420 unsigned long i, j, k;
2421 ulonglong li;
2422 int negative;
2423 ulong cutoff, cutoff2, cutoff3;
2424
2425 s= nptr;
2426 /* If fixed length string */
2427 if (endptr)
2428 {
2429 /* Make sure string length is even */
2430 end= s + ((*endptr - s) / 4) * 4;
2431 while (s < end && !s[0] && !s[1] && !s[2] &&
2432 (s[3] == ' ' || s[3] == '\t'))
2433 s+= 4;
2434 if (s == end)
2435 goto no_conv;
2436 }
2437 else
2438 {
2439 /* We don't support null terminated strings in UCS2 */
2440 goto no_conv;
2441 }
2442
2443 /* Check for a sign. */
2444 negative= 0;
2445 if (!s[0] && !s[1] && !s[2] && s[3] == '-')
2446 {
2447 *error= -1; /* Mark as negative number */
2448 negative= 1;
2449 s+= 4;
2450 if (s == end)
2451 goto no_conv;
2452 cutoff= MAX_NEGATIVE_NUMBER / LFACTOR2;
2453 cutoff2= (MAX_NEGATIVE_NUMBER % LFACTOR2) / 100;
2454 cutoff3= MAX_NEGATIVE_NUMBER % 100;
2455 }
2456 else
2457 {
2458 *error= 0;
2459 if (!s[0] && !s[1] && !s[2] && s[3] == '+')
2460 {
2461 s+= 4;
2462 if (s == end)
2463 goto no_conv;
2464 }
2465 cutoff= ULONGLONG_MAX / LFACTOR2;
2466 cutoff2= ULONGLONG_MAX % LFACTOR2 / 100;
2467 cutoff3= ULONGLONG_MAX % 100;
2468 }
2469
2470 /* Handle case where we have a lot of pre-zero */
2471 if (!s[0] && !s[1] && !s[2] && s[3] == '0')
2472 {
2473 i= 0;
2474 do
2475 {
2476 s+= 4;
2477 if (s == end)
2478 goto end_i; /* Return 0 */
2479 }
2480 while (!s[0] && !s[1] && !s[2] && s[3] == '0');
2481 n_end= s + 4 * INIT_CNT;
2482 }
2483 else
2484 {
2485 /* Read first digit to check that it's a valid number */
2486 if (s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2487 goto no_conv;
2488 i= c;
2489 s+= 4;
2490 n_end= s + 4 * (INIT_CNT-1);
2491 }
2492
2493 /* Handle first 9 digits and store them in i */
2494 if (n_end > end)
2495 n_end= end;
2496 for (; s != n_end ; s+= 4)
2497 {
2498 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2499 goto end_i;
2500 i= i * 10 + c;
2501 }
2502 if (s == end)
2503 goto end_i;
2504
2505 /* Handle next 9 digits and store them in j */
2506 j= 0;
2507 start= s; /* Used to know how much to shift i */
2508 n_end= true_end= s + 4 * INIT_CNT;
2509 if (n_end > end)
2510 n_end= end;
2511 do
2512 {
2513 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2514 goto end_i_and_j;
2515 j= j * 10 + c;
2516 s+= 4;
2517 } while (s != n_end);
2518 if (s == end)
2519 {
2520 if (s != true_end)
2521 goto end_i_and_j;
2522 goto end3;
2523 }
2524 if (s[0] || s[1] || s[2] || (c= (s[3] - '0')) > 9)
2525 goto end3;
2526
2527 /* Handle the next 1 or 2 digits and store them in k */
2528 k=c;
2529 s+= 4;
2530 if (s == end || s[0] || s[1] || s[2] || (c= (s[3]-'0')) > 9)
2531 goto end4;
2532 k= k * 10 + c;
2533 s+= 4;
2534 *endptr= (char*) s;
2535
2536 /* number string should have ended here */
2537 if (s != end && !s[0] && !s[1] && !s[2] && (c= (s[3] - '0')) <= 9)
2538 goto overflow;
2539
2540 /* Check that we didn't get an overflow with the last digit */
2541 if (i > cutoff || (i == cutoff && ((j > cutoff2 || j == cutoff2) &&
2542 k > cutoff3)))
2543 goto overflow;
2544 li= i * LFACTOR2+ (ulonglong) j * 100 + k;
2545 return (longlong) li;
2546
2547 overflow: /* *endptr is set here */
2548 *error= MY_ERRNO_ERANGE;
2549 return negative ? LONGLONG_MIN : (longlong) ULONGLONG_MAX;
2550
2551 end_i:
2552 *endptr= (char*) s;
2553 return (negative ? ((longlong) -(long) i) : (longlong) i);
2554
2555 end_i_and_j:
2556 li= (ulonglong) i * lfactor[(size_t) (s-start) / 4] + j;
2557 *endptr= (char*) s;
2558 return (negative ? -((longlong) li) : (longlong) li);
2559
2560 end3:
2561 li= (ulonglong) i*LFACTOR+ (ulonglong) j;
2562 *endptr= (char*) s;
2563 return (negative ? -((longlong) li) : (longlong) li);
2564
2565 end4:
2566 li= (ulonglong) i*LFACTOR1+ (ulonglong) j * 10 + k;
2567 *endptr= (char*) s;
2568 if (negative)
2569 {
2570 if (li > MAX_NEGATIVE_NUMBER)
2571 goto overflow;
2572 return -((longlong) li);
2573 }
2574 return (longlong) li;
2575
2576 no_conv:
2577 /* There was no number to convert. */
2578 *error= MY_ERRNO_EDOM;
2579 *endptr= (char *) nptr;
2580 return 0;
2581 }
2582
2583
2584 static size_t
my_numchars_utf32(CHARSET_INFO * cs,const char * b,const char * e)2585 my_numchars_utf32(CHARSET_INFO *cs __attribute__((unused)),
2586 const char *b, const char *e)
2587 {
2588 return (size_t) (e - b) / 4;
2589 }
2590
2591
2592 static size_t
my_charpos_utf32(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)2593 my_charpos_utf32(CHARSET_INFO *cs __attribute__((unused)),
2594 const char *b, const char *e, size_t pos)
2595 {
2596 size_t string_length= (size_t) (e - b);
2597 return pos * 4 > string_length ? string_length + 4 : pos * 4;
2598 }
2599
2600
2601 static
my_fill_utf32(CHARSET_INFO * cs,char * s,size_t slen,int fill)2602 void my_fill_utf32(CHARSET_INFO *cs,
2603 char *s, size_t slen, int fill)
2604 {
2605 char buf[10];
2606 #ifdef DBUG_ASSERT_EXISTS
2607 uint buflen;
2608 #endif
2609 char *e= s + slen;
2610
2611 DBUG_ASSERT((slen % 4) == 0);
2612
2613 #ifdef DBUG_ASSERT_EXISTS
2614 buflen=
2615 #endif
2616 cs->cset->wc_mb(cs, (my_wc_t) fill, (uchar*) buf,
2617 (uchar*) buf + sizeof(buf));
2618 DBUG_ASSERT(buflen == 4);
2619 while (s < e)
2620 {
2621 memcpy(s, buf, 4);
2622 s+= 4;
2623 }
2624 }
2625
2626
2627 static int
my_wildcmp_utf32_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2628 my_wildcmp_utf32_ci(CHARSET_INFO *cs,
2629 const char *str, const char *str_end,
2630 const char *wildstr, const char *wildend,
2631 int escape, int w_one, int w_many)
2632 {
2633 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
2634 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2635 escape, w_one, w_many, uni_plane);
2636 }
2637
2638
2639 static int
my_wildcmp_utf32_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)2640 my_wildcmp_utf32_bin(CHARSET_INFO *cs,
2641 const char *str,const char *str_end,
2642 const char *wildstr,const char *wildend,
2643 int escape, int w_one, int w_many)
2644 {
2645 return my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
2646 escape, w_one, w_many, NULL);
2647 }
2648
2649
2650 static size_t
my_scan_utf32(CHARSET_INFO * cs,const char * str,const char * end,int sequence_type)2651 my_scan_utf32(CHARSET_INFO *cs,
2652 const char *str, const char *end, int sequence_type)
2653 {
2654 const char *str0= str;
2655
2656 switch (sequence_type)
2657 {
2658 case MY_SEQ_SPACES:
2659 for ( ; str < end; )
2660 {
2661 my_wc_t wc;
2662 int res= my_utf32_uni(cs, &wc, (uchar*) str, (uchar*) end);
2663 if (res < 0 || wc != ' ')
2664 break;
2665 str+= res;
2666 }
2667 return (size_t) (str - str0);
2668 case MY_SEQ_NONSPACES:
2669 DBUG_ASSERT(0); /* Not implemented */
2670 /* pass through */
2671 default:
2672 return 0;
2673 }
2674 }
2675
2676
/*
  Collation handler table for utf32_general_ci; referenced by
  my_charset_utf32_general_ci. Entries are positional. It uses the
  *_utf32_general_ci comparison functions, the case-insensitive
  wildcard matcher and the space-stripping hash (my_hash_sort_utf32).
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_general_ci,
  my_strnncollsp_utf32_general_ci,
  my_strnncollsp_nchars_utf32_general_ci,
  my_strnxfrm_utf32_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2692
2693
/*
  Collation handler table for utf32_bin; referenced by
  my_charset_utf32_bin. Entries are positional. It uses the
  *_utf32_bin comparison functions, the binary wildcard matcher and
  the space-stripping hash (my_hash_sort_utf32).
*/
static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_bin,
  my_strnncollsp_nchars_utf32_bin,
  my_strnxfrm_unicode_full_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32,
  my_propagate_simple
};
2709
2710
/*
  Collation handler table for utf32_general_nopad_ci; referenced by
  my_charset_utf32_general_nopad_ci. Compared with the padding variant
  it plugs in the *_nopad_* comparison and strnxfrm functions and
  hashes with my_hash_sort_utf32_nopad.
*/
static MY_COLLATION_HANDLER my_collation_utf32_general_nopad_ci_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_general_ci,
  my_strnncollsp_utf32_general_nopad_ci,
  my_strnncollsp_nchars_utf32_general_nopad_ci,
  my_strnxfrm_nopad_utf32_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_utf32_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32_nopad,
  my_propagate_simple
};
2726
2727
/*
  Collation handler table for utf32_nopad_bin; referenced by
  my_charset_utf32_nopad_bin. Compared with the padding binary variant
  it plugs in the *_nopad_* comparison and strnxfrm functions and
  hashes with my_hash_sort_utf32_nopad.
*/
static MY_COLLATION_HANDLER my_collation_utf32_nopad_bin_handler =
{
  NULL, /* init */
  my_strnncoll_utf32_bin,
  my_strnncollsp_utf32_nopad_bin,
  my_strnncollsp_nchars_utf32_nopad_bin,
  my_strnxfrm_unicode_full_nopad_bin,
  my_strnxfrmlen_unicode_full_bin,
  my_like_range_generic,
  my_wildcmp_utf32_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_utf32_nopad,
  my_propagate_simple
};
2743
2744
/*
  Character set handler table shared by all four utf32 charset_info_st
  objects defined after it. Entries are positional. utf32-specific
  functions are mixed with helpers shared with other fixed-width
  Unicode encodings (*_mb2_or_mb4) and generic multi-byte helpers
  (*_mb). my_uni_utf32 is listed twice: once after my_utf32_uni and
  once as the last entry.
*/
MY_CHARSET_HANDLER my_charset_utf32_handler=
{
  NULL, /* init */
  my_numchars_utf32,
  my_charpos_utf32,
  my_lengthsp_utf32,
  my_numcells_mb,
  my_utf32_uni,
  my_uni_utf32,
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_utf32,
  my_casedn_utf32,
  my_snprintf_utf32,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_utf32,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_utf32,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_utf32,
  my_charlen_utf32,
  my_well_formed_char_length_utf32,
  my_copy_fix_mb2_or_mb4,
  my_uni_utf32,
};
2776
2777
/*
  Descriptor of the utf32_general_ci collation (number 60), flagged
  MY_CS_PRIMARY. Characters are always 4 bytes long
  (mbminlen == mbmaxlen == 4) and the pad character is ' '.
  NOTE(review): max_sort_char is 0xFFFF although the encoding is
  4 bytes wide - confirm this is intended.
*/
struct charset_info_st my_charset_utf32_general_ci=
{
  60,0,0, /* number */
  MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf32", /* cs name */
  "utf32_general_ci", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_general_ci_handler
};
2809
2810
/*
  Descriptor of the utf32_bin collation (number 61), flagged
  MY_CS_BINSORT. Characters are always 4 bytes long
  (mbminlen == mbmaxlen == 4) and the pad character is ' '.
*/
struct charset_info_st my_charset_utf32_bin=
{
  61,0,0, /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
  "utf32", /* cs name */
  "utf32_bin", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_bin_handler
};
2842
2843
/*
  Descriptor of the utf32_general_nopad_ci collation. Its number is
  derived from 60 through MY_NOPAD_ID() and the state carries
  MY_CS_NOPAD; unlike utf32_general_ci it is not flagged MY_CS_PRIMARY
  and it uses the nopad collation handler.
*/
struct charset_info_st my_charset_utf32_general_nopad_ci=
{
  MY_NOPAD_ID(60),0,0, /* number */
  MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD,
  "utf32", /* cs name */
  "utf32_general_nopad_ci", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_general_nopad_ci_handler
};
2875
2876
/*
  Descriptor of the utf32_nopad_bin collation. Its number is derived
  from 61 through MY_NOPAD_ID() and the state carries both
  MY_CS_BINSORT and MY_CS_NOPAD; it uses the nopad binary collation
  handler.
*/
struct charset_info_st my_charset_utf32_nopad_bin=
{
  MY_NOPAD_ID(61),0,0, /* number */
  MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|
  MY_CS_NOPAD,
  "utf32", /* cs name */
  "utf32_nopad_bin", /* name */
  "UTF-32 Unicode", /* comment */
  NULL, /* tailoring */
  NULL, /* ctype */
  NULL, /* to_lower */
  NULL, /* to_upper */
  NULL, /* sort_order */
  NULL, /* uca */
  NULL, /* tab_to_uni */
  NULL, /* tab_from_uni */
  &my_unicase_default, /* caseinfo */
  NULL, /* state_map */
  NULL, /* ident_map */
  1, /* strxfrm_multiply */
  1, /* caseup_multiply */
  1, /* casedn_multiply */
  4, /* mbminlen */
  4, /* mbmaxlen */
  0, /* min_sort_char */
  0xFFFF, /* max_sort_char */
  ' ', /* pad char */
  0, /* escape_with_backslash_is_dangerous */
  1, /* levels_for_order */
  &my_charset_utf32_handler,
  &my_collation_utf32_nopad_bin_handler
};
2909
2910
2911 #endif /* HAVE_CHARSET_utf32 */
2912
2913
2914 #ifdef HAVE_CHARSET_ucs2
2915
2916 #include "ctype-ucs2.h"
2917
/*
  Character class table installed as the "ctype" member of the ucs2
  charset_info_st objects. It holds 257 entries: one leading 0 followed
  by 16 rows of 16 values; the last 8 rows are all zero.
  NOTE(review): the values look like class bit masks for codes 0..255
  with the leading entry as an offset slot - confirm against the ctype
  macros in m_ctype.h.
*/
static const uchar ctype_ucs2[] = {
    0,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32,
   32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
   72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16,
   16,129,129,129,129,129,129,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 16, 16, 16, 16, 16,
   16,130,130,130,130,130,130,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 16, 16, 16, 16, 32,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
2937
/*
  256-entry byte mapping installed as the "to_lower" member of the ucs2
  charset_info_st objects. It is the identity mapping except that
  65..90 ('A'..'Z') map to 97..122 ('a'..'z').
*/
static const uchar to_lower_ucs2[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   64, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122, 91, 92, 93, 94, 95,
   96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
2956
/*
  256-entry byte mapping installed as both the "to_upper" and the
  "sort_order" member of the ucs2 charset_info_st objects. It is the
  identity mapping except that 97..122 ('a'..'z') map to
  65..90 ('A'..'Z').
*/
static const uchar to_upper_ucs2[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
   96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
   80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
2975
2976
/* Definitions for strcoll.inl */

/* Every pair of bytes is accepted as a 2-byte character: always true */
#define IS_MB2_CHAR(x,y)                (1)

/*
  Combine two bytes, high byte first, into one 16-bit code.
  Both arguments are parenthesized so that an expression argument is
  cast to uchar as a whole rather than only its first operand.
*/
#define UCS2_CODE(b0,b1)                ((((uchar) (b0)) << 8) | ((uchar) (b1)))
2980
2981
/*
  Sort weight of the UCS2 character made of bytes b0 (high) and b1 (low)
  for the general_ci collations: the "sort" value from the matching page
  of my_unicase_default_pages, or the code itself when that page is
  missing.
*/
static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= UCS2_CODE(b0, b1);
  MY_UNICASE_CHARACTER *page= my_unicase_default_pages[code >> 8];

  if (!page)
    return (int) code;
  return (int) page[code & 0xFF].sort;
}
2988
2989
/*
  Instantiate strcoll.inl for ucs2_general_ci.
  MY_FUNCTION_NAME() appends "_ucs2_general_ci" to the generated names,
  characters are decoded with my_mb_wc_ucs2_quick() and weighted with
  my_weight_mb2_ucs2_general_ci(). Both DEFINE_STRNXFRM_UNICODE and
  DEFINE_STRNXFRM_UNICODE_NOPAD are requested.
  NOTE(review): none of these macros is #undef'ed here although they
  are defined again for the next instantiation, so strcoll.inl
  presumably undefines them itself - confirm.
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _ucs2_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_ucs2_quick(pwc, s, e)
#define OPTIMIZE_ASCII           0
#define UNICASE_MAXCHAR          MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0            my_unicase_default_page00
#define UNICASE_PAGES            my_unicase_default_pages
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_ucs2_general_ci(b0,b1)
#include "strcoll.inl"
3001
3002
/*
  Instantiate strcoll.inl for ucs2_bin.
  MY_FUNCTION_NAME() appends "_ucs2_bin" to the generated names and the
  weight of a character is its plain 16-bit code (UCS2_CODE).
  DEFINE_STRNXFRM_UNICODE_BIN2 is requested.
*/
#define MY_FUNCTION_NAME(x)      my_ ## x ## _ucs2_bin
#define DEFINE_STRNXFRM_UNICODE_BIN2
#define MY_MB_WC(cs, pwc, s, e)  my_mb_wc_ucs2_quick(pwc, s, e)
#define OPTIMIZE_ASCII           0
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        UCS2_CODE(b0,b1)
#include "strcoll.inl"
3010
3011
/*
  Instantiate strcoll.inl for ucs2_general_nopad_ci with
  DEFINE_STRNNCOLLSP_NOPAD set. Weights are the same as for
  ucs2_general_ci (my_weight_mb2_ucs2_general_ci).
*/
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _ucs2_general_nopad_ci
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        my_weight_mb2_ucs2_general_ci(b0,b1)
#include "strcoll.inl"
3017
3018
/*
  Instantiate strcoll.inl for ucs2_nopad_bin with
  DEFINE_STRNNCOLLSP_NOPAD set. Weights are the plain 16-bit codes
  (UCS2_CODE), as for ucs2_bin.
*/
#define DEFINE_STRNNCOLLSP_NOPAD
#define MY_FUNCTION_NAME(x)      my_ ## x ## _ucs2_nopad_bin
#define WEIGHT_ILSEQ(x)          (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1)        UCS2_CODE(b0,b1)
#include "strcoll.inl"
3024
3025
3026 static int
my_charlen_ucs2(CHARSET_INFO * cs,const uchar * s,const uchar * e)3027 my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3028 const uchar *s, const uchar *e)
3029 {
3030 return s + 2 > e ? MY_CS_TOOSMALLN(2) : 2;
3031 }
3032
3033
/*
  Decode one UCS2 character from [s, e) into *pwc by delegating to
  my_mb_wc_ucs2_quick(); its result is returned unchanged.
  cs is not used.
*/
static int my_ucs2_uni(CHARSET_INFO *cs __attribute__((unused)),
                       my_wc_t * pwc, const uchar *s, const uchar *e)
{
  int nbytes= my_mb_wc_ucs2_quick(pwc, s, e);
  return nbytes;
}
3039
/*
  Encode the code point wc as one UCS2 character at r, high byte first.

  @return MY_CS_TOOSMALL2 when fewer than 2 bytes are available before
          e, MY_CS_ILUNI when wc is above 0xFFFF, 2 on success.
          The space check is done first. cs is not used.
*/
static int my_uni_ucs2(CHARSET_INFO *cs __attribute__((unused)) ,
                       my_wc_t wc, uchar *r, uchar *e)
{
  int rc= 2;

  if (r + 2 > e)
    rc= MY_CS_TOOSMALL2;
  else if (wc > 0xFFFF)       /* UCS2 does not support characters outside BMP */
    rc= MY_CS_ILUNI;
  else
  {
    r[0]= (uchar) (wc >> 8);
    r[1]= (uchar) (wc & 0xFF);
  }
  return rc;
}
3053
3054
3055 static inline void
my_tolower_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3056 my_tolower_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3057 {
3058 MY_UNICASE_CHARACTER *page;
3059 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3060 *wc= page[*wc & 0xFF].tolower;
3061 }
3062
3063
3064 static inline void
my_toupper_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3065 my_toupper_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3066 {
3067 MY_UNICASE_CHARACTER *page;
3068 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3069 *wc= page[*wc & 0xFF].toupper;
3070 }
3071
3072
3073 static inline void
my_tosort_ucs2(MY_UNICASE_INFO * uni_plane,my_wc_t * wc)3074 my_tosort_ucs2(MY_UNICASE_INFO *uni_plane, my_wc_t *wc)
3075 {
3076 MY_UNICASE_CHARACTER *page;
3077 if ((page= uni_plane->page[(*wc >> 8) & 0xFF]))
3078 *wc= page[*wc & 0xFF].sort;
3079 }
3080
/*
  Convert a UCS2 string to upper case, character by character.

  Each character is decoded with my_ucs2_uni(), folded with
  my_toupper_ucs2() and stored into dst with my_uni_ucs2(). Conversion
  stops at the end of src, at the first character that cannot be
  decoded, or when the stored length differs from the decoded length.

  @return srclen, regardless of where the conversion stopped
*/
static size_t my_caseup_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                           char *dst, size_t dstlen)
{
  MY_UNICASE_INFO *case_info= cs->caseinfo;
  const char *src_limit= src + srclen;
  char *dst_limit= dst + dstlen;
  my_wc_t code;
  DBUG_ASSERT(srclen <= dstlen);

  while (src < src_limit)
  {
    int in_len= my_ucs2_uni(cs, &code, (uchar *) src, (uchar*) src_limit);
    if (in_len <= 0)
      break;

    my_toupper_ucs2(case_info, &code);
    if (my_uni_ucs2(cs, code, (uchar*) dst, (uchar*) dst_limit) != in_len)
      break;

    src+= in_len;
    dst+= in_len;
  }
  return srclen;
}
3102
3103
3104 static void
my_hash_sort_ucs2_nopad(CHARSET_INFO * cs,const uchar * s,size_t slen,ulong * nr1,ulong * nr2)3105 my_hash_sort_ucs2_nopad(CHARSET_INFO *cs, const uchar *s, size_t slen,
3106 ulong *nr1, ulong *nr2)
3107 {
3108 my_wc_t wc;
3109 int res;
3110 const uchar *e=s+slen;
3111 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3112 register ulong m1= *nr1, m2= *nr2;
3113
3114 while ((s < e) && (res=my_ucs2_uni(cs,&wc, (uchar *)s, (uchar*)e)) >0)
3115 {
3116 my_tosort_ucs2(uni_plane, &wc);
3117 MY_HASH_ADD_16(m1, m2, wc);
3118 s+=res;
3119 }
3120 *nr1= m1;
3121 *nr2= m2;
3122 }
3123
3124
/*
  Hash a UCS2 string for the padding collations: the length is first
  replaced by the value my_lengthsp_mb2() returns (presumably the
  length without trailing spaces - confirm against that helper), then
  the prefix is hashed by my_hash_sort_ucs2_nopad().
*/
static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, size_t slen,
                              ulong *nr1, ulong *nr2)
{
  my_hash_sort_ucs2_nopad(cs, s,
                          my_lengthsp_mb2(cs, (const char *) s, slen),
                          nr1, nr2);
}
3131
/*
  Convert a UCS2 string to lower case, character by character.

  Each character is decoded with my_ucs2_uni(), folded with
  my_tolower_ucs2() and stored into dst with my_uni_ucs2(). Conversion
  stops at the end of src, at the first character that cannot be
  decoded, or when the stored length differs from the decoded length.

  @return srclen, regardless of where the conversion stopped
*/
static size_t my_casedn_ucs2(CHARSET_INFO *cs, const char *src, size_t srclen,
                           char *dst, size_t dstlen)
{
  MY_UNICASE_INFO *case_info= cs->caseinfo;
  const char *src_limit= src + srclen;
  char *dst_limit= dst + dstlen;
  my_wc_t code;
  DBUG_ASSERT(srclen <= dstlen);

  while (src < src_limit)
  {
    int in_len= my_ucs2_uni(cs, &code, (uchar*) src, (uchar*) src_limit);
    if (in_len <= 0)
      break;

    my_tolower_ucs2(case_info, &code);
    if (my_uni_ucs2(cs, code, (uchar*) dst, (uchar*) dst_limit) != in_len)
      break;

    src+= in_len;
    dst+= in_len;
  }
  return srclen;
}
3153
3154
3155 static void
my_fill_ucs2(CHARSET_INFO * cs,char * s,size_t l,int fill)3156 my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3157 char *s, size_t l, int fill)
3158 {
3159 DBUG_ASSERT(fill <= 0xFFFF);
3160 #ifdef WAITING_FOR_GCC_VECTORIZATION_BUG_TO_BE_FIXED
3161 /*
3162 This code with int2store() is known to be faster on some processors,
3163 but crashes on other processors due to a possible bug in GCC's
3164 -ftree-vectorization (which is enabled in -O3) in case of
3165 a non-aligned memory. See here for details:
3166 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58039
3167 */
3168 char *last= s + l - 2;
3169 uint16 tmp= (fill >> 8) + ((fill & 0xFF) << 8); /* swap bytes */
3170 DBUG_ASSERT(fill <= 0xFFFF);
3171 for ( ; s <= last; s+= 2)
3172 int2store(s, tmp); /* store little-endian */
3173 #else
3174 for ( ; l >= 2; s[0]= (fill >> 8), s[1]= (fill & 0xFF), s+= 2, l-= 2);
3175 #endif
3176 }
3177
3178
3179 static
my_numchars_ucs2(CHARSET_INFO * cs,const char * b,const char * e)3180 size_t my_numchars_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3181 const char *b, const char *e)
3182 {
3183 return (size_t) (e-b)/2;
3184 }
3185
3186
3187 static
my_charpos_ucs2(CHARSET_INFO * cs,const char * b,const char * e,size_t pos)3188 size_t my_charpos_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3189 const char *b __attribute__((unused)),
3190 const char *e __attribute__((unused)),
3191 size_t pos)
3192 {
3193 size_t string_length= (size_t) (e - b);
3194 return pos > string_length ? string_length + 2 : pos * 2;
3195 }
3196
3197
3198 static size_t
my_well_formed_char_length_ucs2(CHARSET_INFO * cs,const char * b,const char * e,size_t nchars,MY_STRCOPY_STATUS * status)3199 my_well_formed_char_length_ucs2(CHARSET_INFO *cs __attribute__((unused)),
3200 const char *b, const char *e,
3201 size_t nchars, MY_STRCOPY_STATUS *status)
3202 {
3203 size_t length= e - b;
3204 if (nchars * 2 <= length)
3205 {
3206 status->m_well_formed_error_pos= NULL;
3207 status->m_source_end_pos= b + (nchars * 2);
3208 return nchars;
3209 }
3210 if (length % 2)
3211 {
3212 status->m_well_formed_error_pos= status->m_source_end_pos= e - 1;
3213 }
3214 else
3215 {
3216 status->m_well_formed_error_pos= NULL;
3217 status->m_source_end_pos= e;
3218 }
3219 return length / 2;
3220 }
3221
3222
3223 static
my_wildcmp_ucs2_ci(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3224 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
3225 const char *str,const char *str_end,
3226 const char *wildstr,const char *wildend,
3227 int escape, int w_one, int w_many)
3228 {
3229 MY_UNICASE_INFO *uni_plane= cs->caseinfo;
3230 return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3231 escape,w_one,w_many,uni_plane);
3232 }
3233
3234
3235 static
my_wildcmp_ucs2_bin(CHARSET_INFO * cs,const char * str,const char * str_end,const char * wildstr,const char * wildend,int escape,int w_one,int w_many)3236 int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
3237 const char *str,const char *str_end,
3238 const char *wildstr,const char *wildend,
3239 int escape, int w_one, int w_many)
3240 {
3241 return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
3242 escape,w_one,w_many,NULL);
3243 }
3244
3245
3246 static void
my_hash_sort_ucs2_nopad_bin(CHARSET_INFO * cs,const uchar * key,size_t len,ulong * nr1,ulong * nr2)3247 my_hash_sort_ucs2_nopad_bin(CHARSET_INFO *cs __attribute__((unused)),
3248 const uchar *key, size_t len,
3249 ulong *nr1, ulong *nr2)
3250 {
3251 const uchar *end= key + len;
3252 register ulong m1= *nr1, m2= *nr2;
3253 for ( ; key < end ; key++)
3254 {
3255 MY_HASH_ADD(m1, m2, (uint)*key);
3256 }
3257 *nr1= m1;
3258 *nr2= m2;
3259 }
3260
3261
3262 static void
my_hash_sort_ucs2_bin(CHARSET_INFO * cs,const uchar * key,size_t len,ulong * nr1,ulong * nr2)3263 my_hash_sort_ucs2_bin(CHARSET_INFO *cs,
3264 const uchar *key, size_t len, ulong *nr1, ulong *nr2)
3265 {
3266 size_t lengthsp= my_lengthsp_mb2(cs, (const char *) key, len);
3267 my_hash_sort_ucs2_nopad_bin(cs, key, lengthsp, nr1, nr2);
3268 }
3269
3270
/*
  Collation handler table for ucs2_general_ci; referenced by
  my_charset_ucs2_general_ci. Entries are positional. It uses the
  *_ucs2_general_ci functions generated from strcoll.inl, the
  case-insensitive wildcard matcher and the space-stripping hash
  (my_hash_sort_ucs2).
*/
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_general_ci,
  my_strnncollsp_ucs2_general_ci,
  my_strnncollsp_nchars_ucs2_general_ci,
  my_strnxfrm_ucs2_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2,
  my_propagate_simple
};
3286
3287
/*
  Collation handler table for the binary ucs2 collation. Entries are
  positional. It uses the *_ucs2_bin functions generated from
  strcoll.inl, the binary wildcard matcher and the space-stripping
  binary hash (my_hash_sort_ucs2_bin).
*/
static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_bin,
  my_strnncollsp_ucs2_bin,
  my_strnncollsp_nchars_ucs2_bin,
  my_strnxfrm_ucs2_bin,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2_bin,
  my_propagate_simple
};
3303
3304
/*
  Collation handler table for the NOPAD variant of ucs2_general_ci.
  Compared with the padding variant it plugs in the *_nopad_*
  comparison and strnxfrm functions and hashes with
  my_hash_sort_ucs2_nopad.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_general_nopad_ci_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_general_ci,
  my_strnncollsp_ucs2_general_nopad_ci,
  my_strnncollsp_nchars_ucs2_general_nopad_ci,
  my_strnxfrm_nopad_ucs2_general_ci,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_ci,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2_nopad,
  my_propagate_simple
};
3320
3321
/*
  Collation handler table for the NOPAD variant of the binary ucs2
  collation. Compared with the padding binary variant it plugs in the
  *_nopad_* comparison and strnxfrm functions and hashes with
  my_hash_sort_ucs2_nopad_bin.
*/
static MY_COLLATION_HANDLER my_collation_ucs2_nopad_bin_handler =
{
  NULL, /* init */
  my_strnncoll_ucs2_bin,
  my_strnncollsp_ucs2_nopad_bin,
  my_strnncollsp_nchars_ucs2_nopad_bin,
  my_strnxfrm_nopad_ucs2_bin,
  my_strnxfrmlen_unicode,
  my_like_range_generic,
  my_wildcmp_ucs2_bin,
  my_strcasecmp_mb2_or_mb4,
  my_instr_mb,
  my_hash_sort_ucs2_nopad_bin,
  my_propagate_simple
};
3337
3338
/*
  Character set handler table for ucs2; referenced by the ucs2
  charset_info_st objects. Entries are positional. ucs2-specific
  functions are mixed with helpers for 2-byte encodings (*_mb2),
  helpers shared with 4-byte encodings (*_mb2_or_mb4) and generic
  multi-byte helpers (*_mb). my_uni_ucs2 is listed twice: once as
  wc_mb and once as the last entry.
*/
MY_CHARSET_HANDLER my_charset_ucs2_handler=
{
  NULL, /* init */
  my_numchars_ucs2,
  my_charpos_ucs2,
  my_lengthsp_mb2,
  my_numcells_mb,
  my_ucs2_uni, /* mb_wc */
  my_uni_ucs2, /* wc_mb */
  my_mb_ctype_mb,
  my_caseup_str_mb2_or_mb4,
  my_casedn_str_mb2_or_mb4,
  my_caseup_ucs2,
  my_casedn_ucs2,
  my_snprintf_mb2,
  my_l10tostr_mb2_or_mb4,
  my_ll10tostr_mb2_or_mb4,
  my_fill_ucs2,
  my_strntol_mb2_or_mb4,
  my_strntoul_mb2_or_mb4,
  my_strntoll_mb2_or_mb4,
  my_strntoull_mb2_or_mb4,
  my_strntod_mb2_or_mb4,
  my_strtoll10_mb2,
  my_strntoull10rnd_mb2_or_mb4,
  my_scan_mb2,
  my_charlen_ucs2,
  my_well_formed_char_length_ucs2,
  my_copy_fix_mb2_or_mb4,
  my_uni_ucs2,
};
3370
3371
/*
  Compiled-in definition of collation "ucs2_general_ci" of character set
  "ucs2", number 35.  It is the only ucs2 definition in this part of the
  file whose state flags include MY_CS_PRIMARY.

  The initializer is positional: values must stay in the member order of
  struct charset_info_st, which is declared outside this part of the file.
*/
3372 struct charset_info_st my_charset_ucs2_general_ci=
3373 {
3374 35,0,0, /* number; NOTE(review): meaning of the two trailing 0s is not visible here -- confirm */
3375 MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
3376 "ucs2", /* cs name */
3377 "ucs2_general_ci", /* name */
3378 "", /* comment */
3379 NULL, /* tailoring */
3380 ctype_ucs2, /* ctype */
3381 to_lower_ucs2, /* to_lower */
3382 to_upper_ucs2, /* to_upper */
3383 to_upper_ucs2, /* sort_order: same table as to_upper */
3384 NULL, /* uca */
3385 NULL, /* tab_to_uni */
3386 NULL, /* tab_from_uni */
3387 &my_unicase_default,/* caseinfo */
3388 NULL, /* state_map */
3389 NULL, /* ident_map */
3390 1, /* strxfrm_multiply */
3391 1, /* caseup_multiply */
3392 1, /* casedn_multiply */
3393 2, /* mbminlen */
3394 2, /* mbmaxlen */
3395 0, /* min_sort_char */
3396 0xFFFF, /* max_sort_char */
3397 ' ', /* pad char */
3398 0, /* escape_with_backslash_is_dangerous */
3399 1, /* levels_for_order */
3400 &my_charset_ucs2_handler, /* charset handler table */
3401 &my_collation_ucs2_general_ci_handler /* collation handler table */
3402 };
3403
3404
/*
  Compiled-in definition of collation "ucs2_general_mysql500_ci" of
  character set "ucs2", number 159.

  Differs from my_charset_ucs2_general_ci in its number, in its name, in
  the absence of MY_CS_PRIMARY from the state flags, and in using
  &my_unicase_mysql500 as caseinfo.  Both definitions use the same
  charset and collation handler tables.

  The initializer is positional: values must stay in the member order of
  struct charset_info_st, which is declared outside this part of the file.
*/
3405 struct charset_info_st my_charset_ucs2_general_mysql500_ci=
3406 {
3407 159, 0, 0, /* number; NOTE(review): meaning of the two trailing 0s is not visible here -- confirm */
3408 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
3409 "ucs2", /* cs name */
3410 "ucs2_general_mysql500_ci", /* name */
3411 "", /* comment */
3412 NULL, /* tailoring */
3413 ctype_ucs2, /* ctype */
3414 to_lower_ucs2, /* to_lower */
3415 to_upper_ucs2, /* to_upper */
3416 to_upper_ucs2, /* sort_order: same table as to_upper */
3417 NULL, /* uca */
3418 NULL, /* tab_to_uni */
3419 NULL, /* tab_from_uni */
3420 &my_unicase_mysql500, /* caseinfo: differs from ucs2_general_ci */
3421 NULL, /* state_map */
3422 NULL, /* ident_map */
3423 1, /* strxfrm_multiply */
3424 1, /* caseup_multiply */
3425 1, /* casedn_multiply */
3426 2, /* mbminlen */
3427 2, /* mbmaxlen */
3428 0, /* min_sort_char */
3429 0xFFFF, /* max_sort_char */
3430 ' ', /* pad char */
3431 0, /* escape_with_backslash_is_dangerous */
3432 1, /* levels_for_order */
3433 &my_charset_ucs2_handler, /* charset handler table */
3434 &my_collation_ucs2_general_ci_handler /* collation handler table, shared with ucs2_general_ci */
3435 };
3436
3437
/*
  Compiled-in definition of collation "ucs2_bin" of character set "ucs2",
  number 90.  Its state flags carry MY_CS_BINSORT instead of the
  MY_CS_STRNXFRM flag used by the *_ci definitions, and no sort_order
  table is supplied.

  The initializer is positional: values must stay in the member order of
  struct charset_info_st, which is declared outside this part of the file.
*/
3438 struct charset_info_st my_charset_ucs2_bin=
3439 {
3440 90,0,0, /* number; NOTE(review): meaning of the two trailing 0s is not visible here -- confirm */
3441 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII, /* state */
3442 "ucs2", /* cs name */
3443 "ucs2_bin", /* name */
3444 "", /* comment */
3445 NULL, /* tailoring */
3446 ctype_ucs2, /* ctype */
3447 to_lower_ucs2, /* to_lower */
3448 to_upper_ucs2, /* to_upper */
3449 NULL, /* sort_order: none for this collation */
3450 NULL, /* uca */
3451 NULL, /* tab_to_uni */
3452 NULL, /* tab_from_uni */
3453 &my_unicase_default,/* caseinfo */
3454 NULL, /* state_map */
3455 NULL, /* ident_map */
3456 1, /* strxfrm_multiply */
3457 1, /* caseup_multiply */
3458 1, /* casedn_multiply */
3459 2, /* mbminlen */
3460 2, /* mbmaxlen */
3461 0, /* min_sort_char */
3462 0xFFFF, /* max_sort_char */
3463 ' ', /* pad char */
3464 0, /* escape_with_backslash_is_dangerous */
3465 1, /* levels_for_order */
3466 &my_charset_ucs2_handler, /* charset handler table */
3467 &my_collation_ucs2_bin_handler /* collation handler table */
3468 };
3469
3470
/*
  Compiled-in definition of collation "ucs2_general_nopad_ci" of
  character set "ucs2".  Its number is produced by MY_NOPAD_ID(35), i.e.
  derived from the number of ucs2_general_ci (35) by a macro defined
  outside this part of the file.

  Differs from my_charset_ucs2_general_ci in its number, its name, its
  state flags (MY_CS_NOPAD added, MY_CS_PRIMARY absent) and its collation
  handler table; all other initializers are the same.

  The initializer is positional: values must stay in the member order of
  struct charset_info_st, which is declared outside this part of the file.
*/
3471 struct charset_info_st my_charset_ucs2_general_nopad_ci=
3472 {
3473 MY_NOPAD_ID(35),0,0, /* number; NOTE(review): meaning of the two trailing 0s is not visible here -- confirm */
3474 MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD, /* state */
3475 "ucs2", /* cs name */
3476 "ucs2_general_nopad_ci", /* name */
3477 "", /* comment */
3478 NULL, /* tailoring */
3479 ctype_ucs2, /* ctype */
3480 to_lower_ucs2, /* to_lower */
3481 to_upper_ucs2, /* to_upper */
3482 to_upper_ucs2, /* sort_order: same table as to_upper */
3483 NULL, /* uca */
3484 NULL, /* tab_to_uni */
3485 NULL, /* tab_from_uni */
3486 &my_unicase_default, /* caseinfo */
3487 NULL, /* state_map */
3488 NULL, /* ident_map */
3489 1, /* strxfrm_multiply */
3490 1, /* caseup_multiply */
3491 1, /* casedn_multiply */
3492 2, /* mbminlen */
3493 2, /* mbmaxlen */
3494 0, /* min_sort_char */
3495 0xFFFF, /* max_sort_char */
3496 ' ', /* pad char: same value as in the non-NOPAD definitions */
3497 0, /* escape_with_backslash_is_dangerous */
3498 1, /* levels_for_order */
3499 &my_charset_ucs2_handler, /* charset handler table */
3500 &my_collation_ucs2_general_nopad_ci_handler /* collation handler table */
3501 };
3502
3503
/*
  Compiled-in definition of collation "ucs2_nopad_bin" of character set
  "ucs2".  Its number is produced by MY_NOPAD_ID(90), i.e. derived from
  the number of ucs2_bin (90) by a macro defined outside this part of the
  file.

  Differs from my_charset_ucs2_bin in its number, its name, the added
  MY_CS_NOPAD state flag and its collation handler table; all other
  initializers are the same.

  The initializer is positional: values must stay in the member order of
  struct charset_info_st, which is declared outside this part of the file.
*/
3504 struct charset_info_st my_charset_ucs2_nopad_bin=
3505 {
3506 MY_NOPAD_ID(90),0,0, /* number; NOTE(review): meaning of the two trailing 0s is not visible here -- confirm */
3507 MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII|MY_CS_NOPAD, /* state */
3508 "ucs2", /* cs name */
3509 "ucs2_nopad_bin", /* name */
3510 "", /* comment */
3511 NULL, /* tailoring */
3512 ctype_ucs2, /* ctype */
3513 to_lower_ucs2, /* to_lower */
3514 to_upper_ucs2, /* to_upper */
3515 NULL, /* sort_order: none for this collation */
3516 NULL, /* uca */
3517 NULL, /* tab_to_uni */
3518 NULL, /* tab_from_uni */
3519 &my_unicase_default, /* caseinfo */
3520 NULL, /* state_map */
3521 NULL, /* ident_map */
3522 1, /* strxfrm_multiply */
3523 1, /* caseup_multiply */
3524 1, /* casedn_multiply */
3525 2, /* mbminlen */
3526 2, /* mbmaxlen */
3527 0, /* min_sort_char */
3528 0xFFFF, /* max_sort_char */
3529 ' ', /* pad char: same value as in the non-NOPAD definitions */
3530 0, /* escape_with_backslash_is_dangerous */
3531 1, /* levels_for_order */
3532 &my_charset_ucs2_handler, /* charset handler table */
3533 &my_collation_ucs2_nopad_bin_handler /* collation handler table */
3534 };
3535
3536 #endif /* HAVE_CHARSET_ucs2 */
3537