1 /* -*- c-basic-offset: 2 -*- */ 2 /* 3 Copyright(C) 2013-2015 Kouhei Sutou <kou@clear-code.com> 4 5 This library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Library General Public 7 License as published by the Free Software Foundation; version 2 8 of the License. 9 10 This library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this library; if not, write to the Free 17 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, 18 MA 02110-1335 USA 19 */ 20 21 #ifdef HAVE_CONFIG_H 22 # include <config.h> 23 #endif 24 25 #ifdef GROONGA_NORMALIZER_MYSQL_EMBED 26 # define GRN_PLUGIN_FUNCTION_TAG normalizers_mysql 27 #endif 28 29 #include <groonga/normalizer.h> 30 #include <groonga/nfkc.h> 31 32 #include <string.h> 33 #include <stdio.h> 34 35 #include "mysql_general_ci_table.h" 36 #include "mysql_unicode_ci_table.h" 37 #include "mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h" 38 #include "mysql_unicode_520_ci_table.h" 39 #include "mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h" 40 41 #ifdef __GNUC__ 42 # define GNUC_UNUSED __attribute__((__unused__)) 43 #else 44 # define GNUC_UNUSED 45 #endif 46 47 #ifdef _MSC_VER 48 # define inline _inline 49 # define snprintf _snprintf 50 #endif 51 52 #define SNIPPET_BUFFER_SIZE 256 53 54 typedef grn_bool (*normalizer_func)(grn_ctx *ctx, 55 const char *utf8, 56 int *character_length, 57 int rest_length, 58 uint32_t **normalize_table, 59 char *normalized, 60 unsigned int *normalized_characer_length, 61 unsigned int *normalized_length_in_bytes, 62 unsigned int *normalized_n_characters); 63 64 static inline unsigned int 65 unichar_to_utf8(uint32_t unichar, char *output) 66 { 67 unsigned int n_bytes; 68 69 if (unichar < 0x80) { 70 output[0] = unichar; 71 n_bytes = 1; 72 } else if (unichar < 0x0800) { 73 output[0] = ((unichar >> 6) & 0x1f) | 0xc0; 74 output[1] = (unichar & 0x3f) | 0x80; 75 n_bytes = 2; 76 } else if (unichar < 0x10000) { 77 output[0] = (unichar >> 12) | 0xe0; 78 output[1] = ((unichar >> 6) & 0x3f) | 0x80; 79 output[2] = (unichar & 0x3f) | 0x80; 80 n_bytes = 3; 81 } else if (unichar < 0x200000) { 82 output[0] = (unichar >> 18) | 0xf0; 83 output[1] = ((unichar >> 12) & 0x3f) | 0x80; 84 output[2] = ((unichar >> 6) & 0x3f) | 0x80; 85 output[3] = (unichar & 0x3f) | 0x80; 86 n_bytes = 4; 87 } else if (unichar < 0x4000000) { 88 output[0] = (unichar >> 24) | 0xf8; 89 output[1] = ((unichar >> 18) & 0x3f) | 0x80; 90 output[2] = ((unichar >> 12) & 0x3f) | 0x80; 91 output[3] = ((unichar >> 6) & 0x3f) | 0x80; 92 output[4] = (unichar & 0x3f) | 0x80; 93 n_bytes = 5; 94 } else { 95 output[0] = (unichar >> 30) | 0xfc; 96 output[1] = ((unichar >> 24) & 0x3f) | 0x80; 97 output[2] = ((unichar >> 18) & 0x3f) | 0x80; 98 output[3] = ((unichar >> 12) & 0x3f) | 0x80; 99 output[4] = ((unichar >> 6) & 0x3f) | 0x80; 100 output[5] = (unichar & 0x3f) | 0x80; 101 n_bytes = 6; 102 } 103 104 return n_bytes; 105 } 106 107 static inline uint32_t 108 utf8_to_unichar(const char *utf8, int byte_size) 109 { 110 uint32_t unichar; 111 const unsigned char *bytes = (const unsigned char *)utf8; 112 113 switch (byte_size) { 114 case 1 : 115 unichar = bytes[0] & 0x7f; 116 break; 117 case 2 : 118 unichar = ((bytes[0] & 0x1f) << 6) + (bytes[1] & 0x3f); 119 break; 120 case 3 : 121 unichar = 122 ((bytes[0] & 0x0f) << 12) + 123 ((bytes[1] & 0x3f) << 6) + 124 ((bytes[2] & 0x3f)); 125 break; 126 case 4 : 127 unichar = 128 ((bytes[0] & 0x07) << 18) + 129 ((bytes[1] & 0x3f) << 12) + 130 ((bytes[2] & 0x3f) << 6) + 131 ((bytes[3] & 0x3f)); 132 break; 133 case 5 : 134 unichar = 135 ((bytes[0] & 0x03) << 24) + 136 ((bytes[1] & 0x3f) << 18) + 137 ((bytes[2] & 0x3f) << 12) + 138 ((bytes[3] & 0x3f) << 6) + 139 ((bytes[4] & 0x3f)); 140 break; 141 case 6 : 142 unichar = 143 ((bytes[0] & 0x01) << 30) + 144 ((bytes[1] & 0x3f) << 24) + 145 ((bytes[2] & 0x3f) << 18) + 146 ((bytes[3] & 0x3f) << 12) + 147 ((bytes[4] & 0x3f) << 6) + 148 ((bytes[5] & 0x3f)); 149 break; 150 default : 151 unichar = 0; 152 break; 153 } 154 155 return unichar; 156 } 157 158 static inline void 159 decompose_character(const char *rest, int character_length, 160 size_t *page, uint32_t *low_code) 161 { 162 switch (character_length) { 163 case 1 : 164 *page = 0x00; 165 *low_code = rest[0] & 0x7f; 166 break; 167 case 2 : 168 *page = (rest[0] & 0x1c) >> 2; 169 *low_code = ((rest[0] & 0x03) << 6) + (rest[1] & 0x3f); 170 break; 171 case 3 : 172 *page = ((rest[0] & 0x0f) << 4) + ((rest[1] & 0x3c) >> 2); 173 *low_code = ((rest[1] & 0x03) << 6) + (rest[2] & 0x3f); 174 break; 175 case 4 : 176 *page = 177 ((rest[0] & 0x07) << 10) + 178 ((rest[1] & 0x3f) << 4) + 179 ((rest[2] & 0x3c) >> 2); 180 *low_code = ((rest[2] & 0x03) << 6) + (rest[3] & 0x3f); 181 break; 182 case 5 : 183 *page = 184 ((rest[0] & 0x03) << 16) + 185 ((rest[1] & 0x3f) << 10) + 186 ((rest[2] & 0x3f) << 4) + 187 ((rest[3] & 0x3c) >> 2); 188 *low_code = ((rest[3] & 0x03) << 6) + (rest[4] & 0x3f); 189 break; 190 case 6 : 191 *page = 192 ((rest[0] & 0x01) << 22) + 193 ((rest[1] & 0x3f) << 16) + 194 ((rest[2] & 0x3f) << 10) + 195 ((rest[3] & 0x3f) << 4) + 196 ((rest[4] & 0x3c) >> 2); 197 *low_code = ((rest[4] & 0x03) << 6) + (rest[5] & 0x3f); 198 break; 199 default : 200 *page = (size_t)-1; 201 *low_code = 0x00; 202 break; 203 } 204 } 205 206 static inline void 207 normalize_character(const char *utf8, int character_length, 208 uint32_t **normalize_table, 209 size_t normalize_table_size, 210 char *normalized, 211 unsigned int *normalized_character_length, 212 unsigned int *normalized_length_in_bytes, 213 unsigned int *normalized_n_characters) 214 { 215 size_t page; 216 uint32_t low_code; 217 decompose_character(utf8, character_length, &page, &low_code); 218 if (page < normalize_table_size && normalize_table[page]) { 219 uint32_t normalized_code; 220 unsigned int n_bytes; 221 normalized_code = normalize_table[page][low_code]; 222 if (normalized_code == 0x00000) { 223 *normalized_character_length = 0; 224 } else { 225 n_bytes = unichar_to_utf8(normalized_code, 226 normalized + *normalized_length_in_bytes); 227 *normalized_character_length = n_bytes; 228 *normalized_length_in_bytes += n_bytes; 229 (*normalized_n_characters)++; 230 } 231 } else { 232 int i; 233 for (i = 0; i < character_length; i++) { 234 normalized[*normalized_length_in_bytes + i] = utf8[i]; 235 } 236 *normalized_character_length = character_length; 237 *normalized_length_in_bytes += character_length; 238 (*normalized_n_characters)++; 239 } 240 } 241 242 static void 243 sized_buffer_append(char *buffer, 244 unsigned int buffer_length, 245 unsigned int *buffer_rest_length, 246 const char *string) 247 { 248 size_t string_length; 249 250 string_length = strlen(string); 251 if (string_length >= *buffer_rest_length) { 252 return; 253 } 254 255 strncat(buffer, string, buffer_length); 256 *buffer_rest_length -= string_length; 257 } 258 259 static void 260 sized_buffer_dump_string(char *buffer, 261 unsigned int buffer_length, 262 unsigned int *buffer_rest_length, 263 const char *string, unsigned int string_length) 264 { 265 const unsigned char *bytes; 266 unsigned int i; 267 268 bytes = (const unsigned char *)string; 269 for (i = 0; i < string_length; i++) { 270 unsigned char byte = bytes[i]; 271 #define FORMATTED_BYTE_BUFFER_SIZE 5 /* "0xFF\0" */ 272 char formatted_byte[FORMATTED_BYTE_BUFFER_SIZE]; 273 if (i > 0) { 274 sized_buffer_append(buffer, buffer_length, buffer_rest_length, 275 " "); 276 } 277 if (byte == 0) { 278 strncpy(formatted_byte, "0x00", FORMATTED_BYTE_BUFFER_SIZE); 279 } else { 280 snprintf(formatted_byte, FORMATTED_BYTE_BUFFER_SIZE, "%#04x", byte); 281 } 282 sized_buffer_append(buffer, buffer_length, buffer_rest_length, 283 formatted_byte); 284 #undef FORMATTED_BYTE_BUFFER_SIZE 285 } 286 } 287 288 static const char * 289 snippet(const char *string, unsigned int length, unsigned int target_byte, 290 char *buffer, unsigned int buffer_length) 291 { 292 const char *elision_mark = "..."; 293 unsigned int max_window_length = 12; 294 unsigned int window_length; 295 unsigned int buffer_rest_length = buffer_length - 1; 296 297 buffer[0] = '\0'; 298 299 if (target_byte > 0) { 300 sized_buffer_append(buffer, buffer_length, &buffer_rest_length, 301 elision_mark); 302 } 303 304 sized_buffer_append(buffer, buffer_length, &buffer_rest_length, "<"); 305 if (target_byte + max_window_length > length) { 306 window_length = length - target_byte; 307 } else { 308 window_length = max_window_length; 309 } 310 sized_buffer_dump_string(buffer, buffer_length, &buffer_rest_length, 311 string + target_byte, window_length); 312 sized_buffer_append(buffer, buffer_length, &buffer_rest_length, 313 ">"); 314 315 if (target_byte + window_length < length) { 316 sized_buffer_append(buffer, buffer_length, &buffer_rest_length, 317 elision_mark); 318 } 319 320 return buffer; 321 } 322 323 static void 324 normalize(grn_ctx *ctx, grn_obj *string, 325 const char *normalizer_type_label, 326 uint32_t **normalize_table, 327 size_t normalize_table_size, 328 normalizer_func custom_normalizer) 329 { 330 const char *original, *rest; 331 unsigned int original_length_in_bytes, rest_length; 332 char *normalized; 333 unsigned int normalized_length_in_bytes = 0; 334 unsigned int normalized_n_characters = 0; 335 unsigned char *types = NULL; 336 unsigned char *current_type = NULL; 337 short *checks = NULL; 338 short *current_check = NULL; 339 grn_encoding encoding; 340 int flags; 341 grn_bool remove_blank_p; 342 343 encoding = grn_string_get_encoding(ctx, string); 344 flags = grn_string_get_flags(ctx, string); 345 remove_blank_p = flags & GRN_STRING_REMOVE_BLANK; 346 grn_string_get_original(ctx, string, &original, &original_length_in_bytes); 347 { 348 unsigned int max_normalized_length_in_bytes = 349 original_length_in_bytes + 1; 350 normalized = GRN_PLUGIN_MALLOC(ctx, max_normalized_length_in_bytes); 351 } 352 if (flags & GRN_STRING_WITH_TYPES) { 353 unsigned int max_normalized_n_characters = original_length_in_bytes + 1; 354 types = GRN_PLUGIN_MALLOC(ctx, max_normalized_n_characters); 355 current_type = types; 356 } 357 if (flags & GRN_STRING_WITH_CHECKS) { 358 unsigned int max_checks_size = sizeof(short) * original_length_in_bytes + 1; 359 checks = GRN_PLUGIN_MALLOC(ctx, max_checks_size); 360 current_check = checks; 361 current_check[0] = 0; 362 } 363 rest = original; 364 rest_length = original_length_in_bytes; 365 while (rest_length > 0) { 366 int character_length; 367 grn_bool custom_normalized = GRN_FALSE; 368 unsigned int normalized_character_length; 369 unsigned int previous_normalized_length_in_bytes = 370 normalized_length_in_bytes; 371 unsigned int previous_normalized_n_characters = 372 normalized_n_characters; 373 374 character_length = grn_plugin_charlen(ctx, rest, rest_length, encoding); 375 if (character_length == 0) { 376 break; 377 } 378 379 if (custom_normalizer) { 380 custom_normalized = custom_normalizer(ctx, 381 rest, 382 &character_length, 383 rest_length - character_length, 384 normalize_table, 385 normalized, 386 &normalized_character_length, 387 &normalized_length_in_bytes, 388 &normalized_n_characters); 389 } 390 if (!custom_normalized) { 391 normalize_character(rest, character_length, 392 normalize_table, normalize_table_size, 393 normalized, 394 &normalized_character_length, 395 &normalized_length_in_bytes, 396 &normalized_n_characters); 397 } 398 399 if (remove_blank_p && 400 normalized_character_length == 1 && 401 normalized[previous_normalized_length_in_bytes] == ' ') { 402 if (current_type > types) { 403 current_type[-1] |= GRN_CHAR_BLANK; 404 } 405 if (current_check) { 406 current_check[0]++; 407 } 408 normalized_length_in_bytes = previous_normalized_length_in_bytes; 409 normalized_n_characters = previous_normalized_n_characters; 410 } else { 411 if (current_type && normalized_character_length > 0) { 412 char *current_normalized; 413 current_normalized = 414 normalized + normalized_length_in_bytes - normalized_character_length; 415 current_type[0] = 416 grn_nfkc_char_type((unsigned char *)current_normalized); 417 current_type++; 418 } 419 if (current_check) { 420 current_check[0] += character_length; 421 if (normalized_character_length > 0) { 422 unsigned int i; 423 current_check++; 424 for (i = 1; i < normalized_character_length; i++) { 425 current_check[0] = 0; 426 current_check++; 427 } 428 current_check[0] = 0; 429 } 430 } 431 } 432 433 rest += character_length; 434 rest_length -= character_length; 435 } 436 if (current_type) { 437 current_type[0] = GRN_CHAR_NULL; 438 } 439 normalized[normalized_length_in_bytes] = '\0'; 440 441 if (rest_length > 0) { 442 char buffer[SNIPPET_BUFFER_SIZE]; 443 GRN_PLUGIN_LOG(ctx, GRN_LOG_DEBUG, 444 "[normalizer][%s] failed to normalize at %u byte: %s", 445 normalizer_type_label, 446 original_length_in_bytes - rest_length, 447 snippet(original, 448 original_length_in_bytes, 449 original_length_in_bytes - rest_length, 450 buffer, 451 SNIPPET_BUFFER_SIZE)); 452 } 453 grn_string_set_normalized(ctx, 454 string, 455 normalized, 456 normalized_length_in_bytes, 457 normalized_n_characters); 458 grn_string_set_types(ctx, string, types); 459 grn_string_set_checks(ctx, string, checks); 460 } 461 462 static grn_obj * 463 mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx, 464 GNUC_UNUSED int nargs, 465 grn_obj **args, 466 GNUC_UNUSED grn_user_data *user_data) 467 { 468 grn_obj *string = args[0]; 469 grn_encoding encoding; 470 const char *normalizer_type_label = "mysql-general-ci"; 471 472 encoding = grn_string_get_encoding(ctx, string); 473 if (encoding != GRN_ENC_UTF8) { 474 GRN_PLUGIN_ERROR(ctx, 475 GRN_FUNCTION_NOT_IMPLEMENTED, 476 "[normalizer][%s] " 477 "UTF-8 encoding is only supported: %s", 478 normalizer_type_label, 479 grn_encoding_to_string(encoding)); 480 return NULL; 481 } 482 normalize(ctx, string, normalizer_type_label, 483 general_ci_table, sizeof(general_ci_table) / sizeof(uint32_t *), 484 NULL); 485 return NULL; 486 } 487 488 static grn_obj * 489 mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx, 490 GNUC_UNUSED int nargs, 491 grn_obj **args, 492 GNUC_UNUSED grn_user_data *user_data) 493 { 494 grn_obj *string = args[0]; 495 grn_encoding encoding; 496 const char *normalizer_type_label = "mysql-unicode-ci"; 497 498 encoding = grn_string_get_encoding(ctx, string); 499 if (encoding != GRN_ENC_UTF8) { 500 GRN_PLUGIN_ERROR(ctx, 501 GRN_FUNCTION_NOT_IMPLEMENTED, 502 "[normalizer][%s] " 503 "UTF-8 encoding is only supported: %s", 504 normalizer_type_label, 505 grn_encoding_to_string(encoding)); 506 return NULL; 507 } 508 normalize(ctx, string, normalizer_type_label, 509 unicode_ci_table, sizeof(unicode_ci_table) / sizeof(uint32_t *), 510 NULL); 511 return NULL; 512 } 513 514 #define HALFWIDTH_KATAKANA_LETTER_KA 0xff76 515 #define HALFWIDTH_KATAKANA_LETTER_KI 0xff77 516 #define HALFWIDTH_KATAKANA_LETTER_KU 0xff78 517 #define HALFWIDTH_KATAKANA_LETTER_KE 0xff79 518 #define HALFWIDTH_KATAKANA_LETTER_KO 0xff7a 519 520 #define HALFWIDTH_KATAKANA_LETTER_SA 0xff7b 521 #define HALFWIDTH_KATAKANA_LETTER_SI 0xff7c 522 #define HALFWIDTH_KATAKANA_LETTER_SU 0xff7d 523 #define HALFWIDTH_KATAKANA_LETTER_SE 0xff7e 524 #define HALFWIDTH_KATAKANA_LETTER_SO 0xff7f 525 526 #define HALFWIDTH_KATAKANA_LETTER_TA 0xff80 527 #define HALFWIDTH_KATAKANA_LETTER_TI 0xff81 528 #define HALFWIDTH_KATAKANA_LETTER_TU 0xff82 529 #define HALFWIDTH_KATAKANA_LETTER_TE 0xff83 530 #define HALFWIDTH_KATAKANA_LETTER_TO 0xff84 531 532 #define HALFWIDTH_KATAKANA_LETTER_HA 0xff8a 533 #define HALFWIDTH_KATAKANA_LETTER_HI 0xff8b 534 #define HALFWIDTH_KATAKANA_LETTER_HU 0xff8c 535 #define HALFWIDTH_KATAKANA_LETTER_HE 0xff8d 536 #define HALFWIDTH_KATAKANA_LETTER_HO 0xff8e 537 538 #define HALFWIDTH_KATAKANA_VOICED_SOUND_MARK 0xff9e 539 #define HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK 0xff9f 540 541 #define HIRAGANA_LETTER_KA 0x304b 542 #define HIRAGANA_VOICED_SOUND_MARK_OFFSET 1 543 #define HIRAGANA_VOICED_SOUND_MARK_GAP 2 544 545 #define HIRAGANA_LETTER_HA 0x306f 546 #define HIRAGANA_HA_LINE_BA_OFFSET 1 547 #define HIRAGANA_HA_LINE_PA_OFFSET 2 548 #define HIRAGANA_HA_LINE_GAP 3 549 550 static grn_bool 551 normalize_halfwidth_katakana_with_voiced_sound_mark( 552 grn_ctx *ctx, 553 const char *utf8, 554 int *character_length, 555 int rest_length, 556 GNUC_UNUSED uint32_t **normalize_table, 557 char *normalized, 558 unsigned int *normalized_character_length, 559 unsigned int *normalized_length_in_bytes, 560 unsigned int *normalized_n_characters) 561 { 562 grn_bool custom_normalized = GRN_FALSE; 563 grn_bool is_voiced_sound_markable_halfwidth_katakana = GRN_FALSE; 564 grn_bool is_semi_voiced_sound_markable_halfwidth_katakana = GRN_FALSE; 565 grn_bool is_ha_line = GRN_FALSE; 566 uint32_t unichar; 567 568 if (*character_length != 3) { 569 return GRN_FALSE; 570 } 571 if (rest_length < 3) { 572 return GRN_FALSE; 573 } 574 575 unichar = utf8_to_unichar(utf8, *character_length); 576 if (HALFWIDTH_KATAKANA_LETTER_KA <= unichar && 577 unichar <= HALFWIDTH_KATAKANA_LETTER_TO) { 578 is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE; 579 } else if (HALFWIDTH_KATAKANA_LETTER_HA <= unichar && 580 unichar <= HALFWIDTH_KATAKANA_LETTER_HO) { 581 is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE; 582 is_semi_voiced_sound_markable_halfwidth_katakana = GRN_TRUE; 583 is_ha_line = GRN_TRUE; 584 } 585 586 if (!is_voiced_sound_markable_halfwidth_katakana && 587 !is_semi_voiced_sound_markable_halfwidth_katakana) { 588 return GRN_FALSE; 589 } 590 591 { 592 int next_character_length; 593 uint32_t next_unichar; 594 next_character_length = grn_plugin_charlen(ctx, 595 utf8 + *character_length, 596 rest_length, 597 GRN_ENC_UTF8); 598 if (next_character_length != 3) { 599 return GRN_FALSE; 600 } 601 next_unichar = utf8_to_unichar(utf8 + *character_length, 602 next_character_length); 603 if (next_unichar == HALFWIDTH_KATAKANA_VOICED_SOUND_MARK) { 604 if (is_voiced_sound_markable_halfwidth_katakana) { 605 unsigned int n_bytes; 606 if (is_ha_line) { 607 n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA + 608 HIRAGANA_HA_LINE_BA_OFFSET + 609 ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) * 610 HIRAGANA_HA_LINE_GAP), 611 normalized + *normalized_length_in_bytes); 612 } else { 613 int small_tu_offset = 0; 614 if (HALFWIDTH_KATAKANA_LETTER_TU <= unichar && 615 unichar <= HALFWIDTH_KATAKANA_LETTER_TO) { 616 small_tu_offset = 1; 617 } 618 n_bytes = unichar_to_utf8(HIRAGANA_LETTER_KA + 619 HIRAGANA_VOICED_SOUND_MARK_OFFSET + 620 small_tu_offset + 621 ((unichar - HALFWIDTH_KATAKANA_LETTER_KA) * 622 HIRAGANA_VOICED_SOUND_MARK_GAP), 623 normalized + *normalized_length_in_bytes); 624 } 625 *character_length += next_character_length; 626 *normalized_character_length = n_bytes; 627 *normalized_length_in_bytes += n_bytes; 628 (*normalized_n_characters)++; 629 custom_normalized = GRN_TRUE; 630 } 631 } else if (next_unichar == HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK) { 632 if (is_semi_voiced_sound_markable_halfwidth_katakana) { 633 unsigned int n_bytes; 634 n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA + 635 HIRAGANA_HA_LINE_PA_OFFSET + 636 ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) * 637 HIRAGANA_HA_LINE_GAP), 638 normalized + *normalized_length_in_bytes); 639 *character_length += next_character_length; 640 *normalized_character_length = n_bytes; 641 *normalized_length_in_bytes += n_bytes; 642 (*normalized_n_characters)++; 643 custom_normalized = GRN_TRUE; 644 } 645 } 646 } 647 648 return custom_normalized; 649 } 650 651 static grn_obj * 652 mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next( 653 GNUC_UNUSED grn_ctx *ctx, 654 GNUC_UNUSED int nargs, 655 grn_obj **args, 656 GNUC_UNUSED grn_user_data *user_data) 657 { 658 grn_obj *string = args[0]; 659 grn_encoding encoding; 660 const char *normalizer_type_label = 661 "mysql-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark"; 662 663 encoding = grn_string_get_encoding(ctx, string); 664 if (encoding != GRN_ENC_UTF8) { 665 GRN_PLUGIN_ERROR(ctx, 666 GRN_FUNCTION_NOT_IMPLEMENTED, 667 "[normalizer][%s] " 668 "UTF-8 encoding is only supported: %s", 669 normalizer_type_label, 670 grn_encoding_to_string(encoding)); 671 return NULL; 672 } 673 normalize(ctx, string, 674 normalizer_type_label, 675 unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table, 676 sizeof(unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *), 677 normalize_halfwidth_katakana_with_voiced_sound_mark); 678 return NULL; 679 } 680 681 static grn_obj * 682 mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx *ctx, 683 GNUC_UNUSED int nargs, 684 grn_obj **args, 685 GNUC_UNUSED grn_user_data *user_data) 686 { 687 grn_obj *string = args[0]; 688 grn_encoding encoding; 689 const char *normalizer_type_label = "mysql-unicode-520-ci"; 690 691 encoding = grn_string_get_encoding(ctx, string); 692 if (encoding != GRN_ENC_UTF8) { 693 GRN_PLUGIN_ERROR(ctx, 694 GRN_FUNCTION_NOT_IMPLEMENTED, 695 "[normalizer][%s] " 696 "UTF-8 encoding is only supported: %s", 697 normalizer_type_label, 698 grn_encoding_to_string(encoding)); 699 return NULL; 700 } 701 normalize(ctx, string, normalizer_type_label, 702 unicode_520_ci_table, 703 sizeof(unicode_520_ci_table) / sizeof(uint32_t *), 704 NULL); 705 return NULL; 706 } 707 708 static grn_obj * 709 mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next( 710 GNUC_UNUSED grn_ctx *ctx, 711 GNUC_UNUSED int nargs, 712 grn_obj **args, 713 GNUC_UNUSED grn_user_data *user_data) 714 { 715 grn_obj *string = args[0]; 716 grn_encoding encoding; 717 const char *normalizer_type_label = 718 "mysql-unicode-520-ci-except-kana-ci-kana-with-voiced-sound-mark"; 719 720 encoding = grn_string_get_encoding(ctx, string); 721 if (encoding != GRN_ENC_UTF8) { 722 GRN_PLUGIN_ERROR(ctx, 723 GRN_FUNCTION_NOT_IMPLEMENTED, 724 "[normalizer][%s] " 725 "UTF-8 encoding is only supported: %s", 726 normalizer_type_label, 727 grn_encoding_to_string(encoding)); 728 return NULL; 729 } 730 normalize(ctx, string, 731 normalizer_type_label, 732 unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table, 733 sizeof(unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *), 734 normalize_halfwidth_katakana_with_voiced_sound_mark); 735 return NULL; 736 } 737 738 grn_rc 739 GRN_PLUGIN_INIT(grn_ctx *ctx) 740 { 741 return ctx->rc; 742 } 743 744 grn_rc 745 GRN_PLUGIN_REGISTER(grn_ctx *ctx) 746 { 747 grn_normalizer_register(ctx, "NormalizerMySQLGeneralCI", -1, 748 NULL, mysql_general_ci_next, NULL); 749 grn_normalizer_register(ctx, "NormalizerMySQLUnicodeCI", -1, 750 NULL, mysql_unicode_ci_next, NULL); 751 grn_normalizer_register(ctx, 752 "NormalizerMySQLUnicodeCI" 753 "Except" 754 "KanaCI" 755 "KanaWithVoicedSoundMark", 756 -1, 757 NULL, 758 mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next, 759 NULL); 760 grn_normalizer_register(ctx, "NormalizerMySQLUnicode520CI", -1, 761 NULL, mysql_unicode_520_ci_next, NULL); 762 grn_normalizer_register(ctx, 763 "NormalizerMySQLUnicode520CI" 764 "Except" 765 "KanaCI" 766 "KanaWithVoicedSoundMark", 767 -1, 768 NULL, 769 mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next, 770 NULL); 771 return GRN_SUCCESS; 772 } 773 774 grn_rc 775 GRN_PLUGIN_FIN(GNUC_UNUSED grn_ctx *ctx) 776 { 777 return GRN_SUCCESS; 778 } 779