1/* 2 Copyright (c) 2015, MariaDB Foundation 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; version 2 of the License. 7 8 This program is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 GNU General Public License for more details. 12 13 You should have received a copy of the GNU General Public License 14 along with this program; if not, write to the Free Software 15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA 16*/ 17 18#ifndef MY_FUNCTION_NAME 19#error MY_FUNCTION_NAME is not defined 20#endif 21 22/* 23 Define strnncoll() and strnncollsp() by default, 24 unless "#define DEFINE_STRNNCOLL 0" is specified. 25*/ 26#ifndef DEFINE_STRNNCOLL 27#define DEFINE_STRNNCOLL 1 28#endif 29 30 31/* 32 The weight for automatically padded spaces when comparing strings with 33 the PAD SPACE property. 34 Should normally be equal to the weight of a regular space. 35*/ 36#ifndef WEIGHT_PAD_SPACE 37#define WEIGHT_PAD_SPACE (' ') 38#endif 39 40 41/* 42 Weight of an illegal byte, must follow these rules: 43 1. Must be greater than weight of any normal character in the collation. 44 2. Two different bad bytes must have different weights and must be 45 compared in their binary order. 46 47 Depends on mbmaxlen of the character set, as well as how the collation 48 sorts various single-byte and multi-byte character blocks. 49 50 The macro below is the default definition, it is suitable for mbmaxlen=2 51 character sets that sort all multi-byte characters after all single-byte 52 characters: big5, euckr, gb2312, gbk. 53 54 All mbmaxlen>2 character sets must provide their own definitions. 55 All collations that have a more complex order (than just MB1 followed by MB2) 56 must also provide their own definitions (see definitions for 57 cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order). 58*/ 59#ifndef WEIGHT_ILSEQ 60#define WEIGHT_ILSEQ(x) (0xFF00 + (x)) 61#endif 62 63 64#if DEFINE_STRNNCOLL 65 66/** 67 Scan a valid character, or a bad byte, or an auto-padded space 68 from a string and calculate the weight of the scanned sequence. 69 70 @param [OUT] weight - the weight is returned here 71 @param str - the string 72 @param end - the end of the string 73 @return - the number of bytes scanned 74 75 The including source file must define the following macros: 76 IS_MB1_CHAR(b0) - for character sets that have MB1 characters 77 IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance 78 IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters 79 IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters 80 IS_MB4_CHAR(b0,b1,b2,b3) - for character sets with have MB4 characters 81 WEIGHT_PAD_SPACE 82 WEIGHT_MB1(b0) - for character sets that have MB1 characters 83 WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters 84 WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters 85 WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters 86 WEIGHT_ILSEQ(x) 87*/ 88static inline uint 89MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) 90{ 91 if (str >= end) 92 { 93 *weight= WEIGHT_PAD_SPACE; 94 return 0; 95 } 96 97#ifdef IS_MB1_CHAR 98 if (IS_MB1_CHAR(*str)) 99 { 100 *weight= WEIGHT_MB1(*str); /* A valid single byte character*/ 101 return 1; 102 } 103#endif 104 105#ifdef IS_MB1_MBHEAD_UNUSED_GAP 106 /* 107 Quickly filter out unused bytes that are neither MB1 nor MBHEAD. 108 E.g. [0x80..0xC1] in utf8. This allows using simplified conditions 109 in IS_MB2_CHAR(), IS_MB3_CHAR(), etc. 110 */ 111 if (IS_MB1_MBHEAD_UNUSED_GAP(*str)) 112 goto bad; 113#endif 114 115#ifdef IS_MB2_CHAR 116 if (str + 2 > end) /* The string ended unexpectedly */ 117 goto bad; /* Treat as a bad byte */ 118 119 if (IS_MB2_CHAR(str[0], str[1])) 120 { 121 *weight= WEIGHT_MB2(str[0], str[1]); 122 return 2; /* A valid two-byte character */ 123 } 124#endif 125 126#ifdef IS_MB3_CHAR 127 if (str + 3 > end) /* Incomplete three-byte character */ 128 goto bad; 129 130 if (IS_MB3_CHAR(str[0], str[1], str[2])) 131 { 132 *weight= WEIGHT_MB3(str[0], str[1], str[2]); 133 return 3; /* A valid three-byte character */ 134 } 135#endif 136 137#ifdef IS_MB4_CHAR 138 if (str + 4 > end) /* Incomplete four-byte character */ 139 goto bad; 140 141 if (IS_MB4_CHAR(str[0], str[1], str[2], str[3])) 142 { 143 *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]); 144 return 4; /* A valid four-byte character */ 145 } 146 147#endif 148 149bad: 150 *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */ 151 return 1; 152} 153 154 155/** 156 Compare two strings according to the collation, 157 without handling the PAD SPACE property. 158 159 Note, cs->coll->strnncoll() is usually used to compare identifiers. 160 Perhaps we should eventually (in 10.2?) create a new collation 161 my_charset_utf8_general_ci_no_pad and have only one comparison function 162 in MY_COLLATION_HANDLER. 163 164 @param cs - the character set and collation 165 @param a - the left string 166 @param a_length - the length of the left string 167 @param b - the right string 168 @param b_length - the length of the right string 169 @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a" 170 @return - the comparison result 171*/ 172static int 173MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)), 174 const uchar *a, size_t a_length, 175 const uchar *b, size_t b_length, 176 my_bool b_is_prefix) 177{ 178 const uchar *a_end= a + a_length; 179 const uchar *b_end= b + b_length; 180 for ( ; ; ) 181 { 182 int a_weight, b_weight, res; 183 uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); 184 uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); 185 /* 186 a_wlen b_wlen Comment 187 ------ ------ ------- 188 0 0 Strings ended simultaneously, "a" and "b" are equal. 189 0 >0 "a" is a prefix of "b", so "a" is smaller. 190 >0 0 "b" is a prefix of "a", check b_is_prefix. 191 >0 >0 Two weights were scanned, check weight difference. 192 */ 193 if (!a_wlen) 194 return b_wlen ? -b_weight : 0; 195 196 if (!b_wlen) 197 return b_is_prefix ? 0 : a_weight; 198 199 if ((res= (a_weight - b_weight))) 200 return res; 201 /* 202 None of the strings has ended yet. 203 */ 204 DBUG_ASSERT(a < a_end); 205 DBUG_ASSERT(b < b_end); 206 a+= a_wlen; 207 b+= b_wlen; 208 } 209 DBUG_ASSERT(0); 210 return 0; 211} 212 213 214#ifdef DEFINE_STRNNCOLLSP_NOPAD 215 216/** 217 Compare two strings according to the collation, with NO PAD handling. 218 219 @param cs - the character set and collation 220 @param a - the left string 221 @param a_length - the length of the left string 222 @param b - the right string 223 @param b_length - the length of the right string 224 @return - the comparison result 225*/ 226static int 227MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), 228 const uchar *a, size_t a_length, 229 const uchar *b, size_t b_length) 230{ 231 return MY_FUNCTION_NAME(strnncoll)(cs, a, a_length, b, b_length, FALSE); 232} 233#else 234/** 235 Compare two strings according to the collation, with PAD SPACE handling. 236 237 @param cs - the character set and collation 238 @param a - the left string 239 @param a_length - the length of the left string 240 @param b - the right string 241 @param b_length - the length of the right string 242 @return - the comparison result 243*/ 244static int 245MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), 246 const uchar *a, size_t a_length, 247 const uchar *b, size_t b_length) 248{ 249 const uchar *a_end= a + a_length; 250 const uchar *b_end= b + b_length; 251 for ( ; ; ) 252 { 253 int a_weight, b_weight, res; 254 uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); 255 uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); 256 if ((res= (a_weight - b_weight))) 257 { 258 /* 259 Got two different weights. Each weight can be generated by either of: 260 - a real character 261 - a bad byte sequence or an incomplete byte sequence 262 - an auto-generated trailing space (PAD SPACE) 263 It does not matter how exactly each weight was generated. 264 Just return the weight difference. 265 */ 266 return res; 267 } 268 if (!a_wlen && !b_wlen) 269 { 270 /* 271 Got two auto-generated trailing spaces, i.e. 272 both strings have now ended, so they are equal. 273 */ 274 DBUG_ASSERT(a == a_end); 275 DBUG_ASSERT(b == b_end); 276 return 0; 277 } 278 /* 279 At least one of the strings has not ended yet, continue comparison. 280 */ 281 DBUG_ASSERT(a < a_end || b < b_end); 282 a+= a_wlen; 283 b+= b_wlen; 284 } 285 DBUG_ASSERT(0); 286 return 0; 287} 288#endif 289 290 291/** 292 Compare two strings according to the collation, 293 with trailing space padding or trimming, according to "nchars". 294 295 @param cs - the character set and collation 296 @param a - the left string 297 @param a_length - the length of the left string 298 @param b - the right string 299 @param b_length - the length of the right string 300 @param nchars - compare this amount of characters only 301 @return - the comparison result 302*/ 303static int 304MY_FUNCTION_NAME(strnncollsp_nchars)(CHARSET_INFO *cs __attribute__((unused)), 305 const uchar *a, size_t a_length, 306 const uchar *b, size_t b_length, 307 size_t nchars) 308{ 309 const uchar *a_end= a + a_length; 310 const uchar *b_end= b + b_length; 311 for ( ; nchars ; nchars--) 312 { 313 int a_weight, b_weight, res; 314 uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); 315 uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); 316 317 if ((res= (a_weight - b_weight))) 318 { 319 /* Got two different weights. See comments in strnncollsp above. */ 320 return res; 321 } 322 if (!a_wlen && !b_wlen) 323 { 324 /* Got two auto-generated trailing spaces. */ 325 DBUG_ASSERT(a == a_end); 326 DBUG_ASSERT(b == b_end); 327 return 0; 328 } 329 /* 330 At least one of the strings has not ended yet, continue comparison. 331 */ 332 DBUG_ASSERT(a < a_end || b < b_end); 333 a+= a_wlen; 334 b+= b_wlen; 335 } 336 return 0; 337} 338 339 340#endif /* DEFINE_STRNNCOLL */ 341 342 343#ifdef DEFINE_STRNXFRM 344#ifndef WEIGHT_MB2_FRM 345#define WEIGHT_MB2_FRM(x,y) WEIGHT_MB2(x,y) 346#endif 347 348static size_t 349MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, 350 uchar *dst, size_t dstlen, uint nweights, 351 const uchar *src, size_t srclen, uint flags) 352{ 353 uchar *d0= dst; 354 uchar *de= dst + dstlen; 355 const uchar *se= src + srclen; 356 const uchar *sort_order= cs->sort_order; 357 358 for (; dst < de && src < se && nweights; nweights--) 359 { 360 if (my_charlen(cs, (const char *) src, (const char *) se) > 1) 361 { 362 /* 363 Note, it is safe not to check (src < se) 364 in the code below, because my_charlen() would 365 not return 2 if src was too short 366 */ 367 uint16 e= WEIGHT_MB2_FRM(src[0], src[1]); 368 *dst++= (uchar) (e >> 8); 369 if (dst < de) 370 *dst++= (uchar) (e & 0xFF); 371 src+= 2; 372 } 373 else 374 *dst++= sort_order ? sort_order[*src++] : *src++; 375 } 376#ifdef DEFINE_STRNNCOLLSP_NOPAD 377 return my_strxfrm_pad_desc_and_reverse_nopad(cs, d0, dst, de, 378 nweights, flags, 0); 379#else 380 return my_strxfrm_pad_desc_and_reverse(cs, d0, dst, de, nweights, flags, 0); 381#endif 382} 383#endif /* DEFINE_STRNXFRM */ 384 385 386#if defined(DEFINE_STRNXFRM_UNICODE) || defined(DEFINE_STRNXFRM_UNICODE_NOPAD) 387 388/* 389 Store sorting weights using 2 bytes per character. 390 391 This function is shared between 392 - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin 393 which support BMP only (U+0000..U+FFFF). 394 - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, 395 which map all supplementary characters to weight 0xFFFD. 396*/ 397 398#ifndef MY_MB_WC 399#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE 400#endif 401 402#ifndef OPTIMIZE_ASCII 403#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE 404#endif 405 406#ifndef UNICASE_MAXCHAR 407#error UNICASE_MAXCHAR must be defined for DEFINE_STRNXFRM_UNICODE 408#endif 409 410#ifndef UNICASE_PAGE0 411#error UNICASE_PAGE0 must be defined for DEFINE_STRNXFRM_UNICODE 412#endif 413 414#ifndef UNICASE_PAGES 415#error UNICASE_PAGES must be defined for DEFINE_STRNXFRM_UNICODE 416#endif 417 418 419static size_t 420MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs __attribute__((unused)), 421 uchar *dst, uchar *de, 422 uint *nweights, 423 const uchar *src, const uchar *se) 424{ 425 my_wc_t UNINIT_VAR(wc); 426 uchar *dst0= dst; 427 428 DBUG_ASSERT(src || !se); 429 DBUG_ASSERT((cs->state & MY_CS_LOWER_SORT) == 0); 430 DBUG_ASSERT(0x7F <= UNICASE_MAXCHAR); 431 432 for (; dst < de && *nweights; (*nweights)--) 433 { 434 int res; 435#if OPTIMIZE_ASCII 436 if (src >= se) 437 break; 438 if (src[0] <= 0x7F) 439 { 440 wc= UNICASE_PAGE0[*src++].sort; 441 PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); 442 continue; 443 } 444#endif 445 if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0) 446 break; 447 src+= res; 448 if (wc <= UNICASE_MAXCHAR) 449 { 450 MY_UNICASE_CHARACTER *page; 451 if ((page= UNICASE_PAGES[wc >> 8])) 452 wc= page[wc & 0xFF].sort; 453 } 454 else 455 wc= MY_CS_REPLACEMENT_CHARACTER; 456 PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); 457 } 458 return dst - dst0; 459} 460 461 462static size_t 463MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, 464 uchar *dst, size_t dstlen, uint nweights, 465 const uchar *src, size_t srclen, uint flags) 466{ 467 uchar *dst0= dst; 468 uchar *de= dst + dstlen; 469 dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, 470 src, src + srclen); 471 DBUG_ASSERT(dst <= de); /* Safety */ 472 473 if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) 474 dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); 475 476 my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); 477 478 if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) 479 dst+= my_strxfrm_pad_unicode(dst, de); 480 return dst - dst0; 481} 482 483 484#ifdef DEFINE_STRNXFRM_UNICODE_NOPAD 485static size_t 486MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, 487 uchar *dst, size_t dstlen, 488 uint nweights, 489 const uchar *src, size_t srclen, uint flags) 490{ 491 uchar *dst0= dst; 492 uchar *de= dst + dstlen; 493 dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, 494 src, src + srclen); 495 DBUG_ASSERT(dst <= de); /* Safety */ 496 497 if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) 498 { 499 size_t len= de - dst; 500 set_if_smaller(len, nweights * 2); 501 memset(dst, 0x00, len); 502 dst+= len; 503 } 504 505 my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); 506 507 if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) 508 { 509 memset(dst, 0x00, de - dst); 510 dst= de; 511 } 512 return dst - dst0; 513} 514#endif 515 516#endif /* DEFINE_STRNXFRM_UNICODE || DEFINE_STRNXFRM_UNICODE_NOPAD */ 517 518 519 520#ifdef DEFINE_STRNXFRM_UNICODE_BIN2 521 522/* 523 Store sorting weights using 2 bytes per character. 524 525 These functions are shared between 526 - utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin 527 which support BMP only (U+0000..U+FFFF). 528 - utf8mb4_general_ci, utf16_general_ci, utf32_general_ci, 529 which map all supplementary characters to weight 0xFFFD. 530*/ 531 532#ifndef MY_MB_WC 533#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE_BIN2 534#endif 535 536#ifndef OPTIMIZE_ASCII 537#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE_BIN2 538#endif 539 540 541static size_t 542MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs __attribute__((unused)), 543 uchar *dst, uchar *de, 544 uint *nweights, 545 const uchar *src, 546 const uchar *se) 547{ 548 my_wc_t UNINIT_VAR(wc); 549 uchar *dst0= dst; 550 551 DBUG_ASSERT(src || !se); 552 553 for (; dst < de && *nweights; (*nweights)--) 554 { 555 int res; 556#if OPTIMIZE_ASCII 557 if (src >= se) 558 break; 559 if (src[0] <= 0x7F) 560 { 561 wc= *src++; 562 PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); 563 continue; 564 } 565#endif 566 if ((res= MY_MB_WC(cs, &wc, src, se)) <= 0) 567 break; 568 src+= res; 569 if (wc > 0xFFFF) 570 wc= MY_CS_REPLACEMENT_CHARACTER; 571 PUT_WC_BE2_HAVE_1BYTE(dst, de, wc); 572 } 573 return dst - dst0; 574} 575 576 577static size_t 578MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, 579 uchar *dst, size_t dstlen, uint nweights, 580 const uchar *src, size_t srclen, uint flags) 581{ 582 uchar *dst0= dst; 583 uchar *de= dst + dstlen; 584 dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, 585 src, src + srclen); 586 DBUG_ASSERT(dst <= de); /* Safety */ 587 588 if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) 589 dst+= my_strxfrm_pad_nweights_unicode(dst, de, nweights); 590 591 my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); 592 593 if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) 594 dst+= my_strxfrm_pad_unicode(dst, de); 595 return dst - dst0; 596} 597 598 599static size_t 600MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, 601 uchar *dst, size_t dstlen, uint nweights, 602 const uchar *src, size_t srclen, uint flags) 603{ 604 uchar *dst0= dst; 605 uchar *de= dst + dstlen; 606 dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights, 607 src, src + srclen); 608 DBUG_ASSERT(dst <= de); /* Safety */ 609 610 if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE)) 611 { 612 size_t len= de - dst; 613 set_if_smaller(len, nweights * 2); 614 memset(dst, 0x00, len); 615 dst+= len; 616 } 617 618 my_strxfrm_desc_and_reverse(dst0, dst, flags, 0); 619 620 if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de) 621 { 622 memset(dst, 0x00, de - dst); 623 dst= de; 624 } 625 return dst - dst0; 626} 627 628#endif /* DEFINE_STRNXFRM_UNICODE_BIN2 */ 629 630 631/* 632 We usually include this file at least two times from the same source file, 633 for the _ci and the _bin collations. Prepare for the second inclusion. 634*/ 635#undef MY_FUNCTION_NAME 636#undef MY_MB_WC 637#undef OPTIMIZE_ASCII 638#undef UNICASE_MAXCHAR 639#undef UNICASE_PAGE0 640#undef UNICASE_PAGES 641#undef WEIGHT_ILSEQ 642#undef WEIGHT_MB1 643#undef WEIGHT_MB2 644#undef WEIGHT_MB3 645#undef WEIGHT_MB4 646#undef WEIGHT_PAD_SPACE 647#undef WEIGHT_MB2_FRM 648#undef DEFINE_STRNXFRM 649#undef DEFINE_STRNXFRM_UNICODE 650#undef DEFINE_STRNXFRM_UNICODE_NOPAD 651#undef DEFINE_STRNXFRM_UNICODE_BIN2 652#undef DEFINE_STRNNCOLL 653#undef DEFINE_STRNNCOLLSP_NOPAD 654