1 /* -*- c-basic-offset: 2 -*- */
2 /*
3 Copyright(C) 2013-2015 Kouhei Sutou <kou@clear-code.com>
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; version 2
8 of the License.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
18 MA 02110-1301 USA
19 */
20
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #ifdef GROONGA_NORMALIZER_MYSQL_EMBED
26 # define GRN_PLUGIN_FUNCTION_TAG normalizers_mysql
27 #endif
28
29 #include <groonga/normalizer.h>
30 #include <groonga/nfkc.h>
31
32 #include <string.h>
33 #include <stdio.h>
34
35 #include "mysql_general_ci_table.h"
36 #include "mysql_unicode_ci_table.h"
37 #include "mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
38 #include "mysql_unicode_520_ci_table.h"
39 #include "mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
40
/* Marks a parameter as possibly unused; expands to nothing when the compiler
 * does not define __GNUC__. */
#ifdef __GNUC__
# define GNUC_UNUSED __attribute__((__unused__))
#else
# define GNUC_UNUSED
#endif

/* When _MSC_VER is defined, map `inline` and `snprintf` onto the
 * underscore-prefixed names. */
#ifdef _MSC_VER
# define inline _inline
# define snprintf _snprintf
#endif

/* Size in bytes of the stack buffer normalize() hands to snippet() when it
 * logs a normalization failure. */
#define SNIPPET_BUFFER_SIZE 256
53
54 typedef grn_bool (*normalizer_func)(grn_ctx *ctx,
55 const char *utf8,
56 int *character_length,
57 int rest_length,
58 uint32_t **normalize_table,
59 char *normalized,
60 unsigned int *normalized_characer_length,
61 unsigned int *normalized_length_in_bytes,
62 unsigned int *normalized_n_characters);
63
/*
 * Encodes `unichar` into `output` using the 1-6 byte UTF-8 scheme and returns
 * the number of bytes written. `output` must have room for up to 6 bytes; no
 * terminator is written.
 */
static inline unsigned int
unichar_to_utf8(uint32_t unichar, char *output)
{
  unsigned int length;
  uint32_t lead_prefix;
  uint32_t remaining = unichar;
  unsigned int index;

  /* Pick the sequence length and the marker bits of the lead byte. */
  if (unichar < 0x80) {
    length = 1;
    lead_prefix = 0x00;
  } else if (unichar < 0x0800) {
    length = 2;
    lead_prefix = 0xc0;
  } else if (unichar < 0x10000) {
    length = 3;
    lead_prefix = 0xe0;
  } else if (unichar < 0x200000) {
    length = 4;
    lead_prefix = 0xf0;
  } else if (unichar < 0x4000000) {
    length = 5;
    lead_prefix = 0xf8;
  } else {
    length = 6;
    lead_prefix = 0xfc;
  }

  /* Fill continuation bytes from the tail, six payload bits at a time. */
  for (index = length - 1; index > 0; index--) {
    output[index] = (char)((remaining & 0x3f) | 0x80);
    remaining >>= 6;
  }
  /* Whatever is left belongs to the lead byte. */
  output[0] = (char)(remaining | lead_prefix);

  return length;
}
106
/*
 * Decodes one UTF-8 sequence of `byte_size` bytes (1-6) starting at `utf8`
 * into its code point. Returns 0 for any other `byte_size`. The bytes are
 * not validated; only their payload bits are extracted.
 */
static inline uint32_t
utf8_to_unichar(const char *utf8, int byte_size)
{
  /* Payload mask of the lead byte, indexed by sequence length - 1. */
  static const unsigned char lead_masks[] = {
    0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
  };
  const unsigned char *cursor = (const unsigned char *)utf8;
  uint32_t code_point;
  int i;

  if (byte_size < 1 || byte_size > 6) {
    return 0;
  }

  code_point = cursor[0] & lead_masks[byte_size - 1];
  for (i = 1; i < byte_size; i++) {
    /* Each continuation byte contributes its low six bits. */
    code_point = (code_point << 6) + (cursor[i] & 0x3f);
  }

  return code_point;
}
157
/*
 * Splits the UTF-8 character at `rest` (`character_length` bytes, 1-6) into
 * a table page (*page = code point >> 8) and an index inside that page
 * (*low_code = code point & 0xff). For any other length, *page is set to
 * (size_t)-1 and *low_code to 0.
 */
static inline void
decompose_character(const char *rest, int character_length,
                    size_t *page, uint32_t *low_code)
{
  /* Payload mask of the lead byte, indexed by sequence length - 1. */
  static const unsigned char lead_masks[] = {
    0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
  };
  uint32_t code_point;
  int i;

  if (character_length < 1 || character_length > 6) {
    *page = (size_t)-1;
    *low_code = 0x00;
    return;
  }

  /* Masking first keeps the result non-negative even when char is signed. */
  code_point = (uint32_t)(rest[0] & lead_masks[character_length - 1]);
  for (i = 1; i < character_length; i++) {
    code_point = (code_point << 6) | (uint32_t)(rest[i] & 0x3f);
  }

  *page = code_point >> 8;
  *low_code = code_point & 0xff;
}
205
/*
 * Normalizes a single character through `normalize_table` and appends the
 * result at normalized + *normalized_length_in_bytes.
 *
 * - Page missing from the table (out of range or NULL entry): the original
 *   bytes are copied unchanged.
 * - Table entry is 0: nothing is emitted, *normalized_character_length is
 *   set to 0 and the running totals are left untouched.
 * - Otherwise the mapped code point is written as UTF-8.
 *
 * In the two emitting cases *normalized_character_length receives the number
 * of bytes written, *normalized_length_in_bytes grows by that amount and
 * *normalized_n_characters is incremented.
 */
static inline void
normalize_character(const char *utf8, int character_length,
                    uint32_t **normalize_table,
                    size_t normalize_table_size,
                    char *normalized,
                    unsigned int *normalized_character_length,
                    unsigned int *normalized_length_in_bytes,
                    unsigned int *normalized_n_characters)
{
  size_t page;
  uint32_t low_code;
  char *destination = normalized + *normalized_length_in_bytes;
  unsigned int emitted;

  decompose_character(utf8, character_length, &page, &low_code);

  if (page >= normalize_table_size || !normalize_table[page]) {
    /* No mapping for this page: pass the character through as is. */
    int offset;
    for (offset = 0; offset < character_length; offset++) {
      destination[offset] = utf8[offset];
    }
    emitted = character_length;
  } else {
    uint32_t mapped = normalize_table[page][low_code];
    if (mapped == 0x00000) {
      /* The table maps this character to nothing. */
      *normalized_character_length = 0;
      return;
    }
    emitted = unichar_to_utf8(mapped, destination);
  }

  *normalized_character_length = emitted;
  *normalized_length_in_bytes += emitted;
  (*normalized_n_characters)++;
}
241
/*
 * Appends `string` to the NUL-terminated text in `buffer` when it fits
 * completely; otherwise leaves `buffer` untouched (no partial append).
 *
 *   buffer              Destination, already NUL-terminated.
 *   buffer_length       Total size of `buffer`; kept for interface
 *                       compatibility, the remaining capacity is what bounds
 *                       the append.
 *   buffer_rest_length  In/out: remaining capacity in characters. Reduced by
 *                       strlen(string) after a successful append.
 *   string              NUL-terminated text to append.
 *
 * A string whose length equals the remaining capacity is also rejected.
 */
static void
sized_buffer_append(char *buffer,
                    unsigned int buffer_length,
                    unsigned int *buffer_rest_length,
                    const char *string)
{
  size_t string_length = strlen(string);

  (void)buffer_length;

  if (string_length >= *buffer_rest_length) {
    return;
  }

  /* strncat()'s count limits how many characters are appended, so it has to
   * be the remaining capacity, not the total buffer size. */
  strncat(buffer, string, *buffer_rest_length);
  *buffer_rest_length -= (unsigned int)string_length;
}
258
/*
 * Appends a hex dump of the first `string_length` bytes of `string` to
 * `buffer`, formatted as "0x.." tokens separated by single spaces
 * (e.g. "0xe3 0x81 0x82").
 *
 * Each token and each separator goes through sized_buffer_append(), so a
 * piece that does not fit in the remaining capacity is silently skipped.
 */
static void
sized_buffer_dump_string(char *buffer,
                         unsigned int buffer_length,
                         unsigned int *buffer_rest_length,
                         const char *string, unsigned int string_length)
{
  const unsigned char *bytes = (const unsigned char *)string;
  unsigned int i;

  for (i = 0; i < string_length; i++) {
    char formatted_byte[sizeof("0xFF")];

    if (i > 0) {
      sized_buffer_append(buffer, buffer_length, buffer_rest_length,
                          " ");
    }
    /* An explicit "0x" prefix with %02x formats every byte, including 0,
     * the same way; "%#04x" drops the prefix for zero and needed a special
     * case. */
    snprintf(formatted_byte, sizeof(formatted_byte), "0x%02x",
             (unsigned int)bytes[i]);
    sized_buffer_append(buffer, buffer_length, buffer_rest_length,
                        formatted_byte);
  }
}
287
/*
 * Builds a short hex-dump excerpt of `string` (`length` bytes) starting at
 * offset `target_byte`, for use in log messages, and returns `buffer`.
 *
 * Layout: an optional leading "..." when the excerpt does not start at
 * offset 0, then "<" + up to 12 dumped bytes + ">", then an optional
 * trailing "..." when bytes remain after the window. Pieces that do not fit
 * into `buffer` (`buffer_length` bytes including the terminator) are
 * dropped by sized_buffer_append().
 */
static const char *
snippet(const char *string, unsigned int length, unsigned int target_byte,
        char *buffer, unsigned int buffer_length)
{
  static const char ellipsis[] = "...";
  const unsigned int window_limit = 12;
  unsigned int remaining = buffer_length - 1;
  unsigned int dumped_length;

  buffer[0] = '\0';

  if (target_byte != 0) {
    sized_buffer_append(buffer, buffer_length, &remaining, ellipsis);
  }

  /* Clip the window to what is left of the input. */
  dumped_length = (target_byte + window_limit > length)
    ? length - target_byte
    : window_limit;

  sized_buffer_append(buffer, buffer_length, &remaining, "<");
  sized_buffer_dump_string(buffer, buffer_length, &remaining,
                           string + target_byte, dumped_length);
  sized_buffer_append(buffer, buffer_length, &remaining, ">");

  if (target_byte + dumped_length < length) {
    sized_buffer_append(buffer, buffer_length, &remaining, ellipsis);
  }

  return buffer;
}
322
323 static void
normalize(grn_ctx * ctx,grn_obj * string,const char * normalizer_type_label,uint32_t ** normalize_table,size_t normalize_table_size,normalizer_func custom_normalizer)324 normalize(grn_ctx *ctx, grn_obj *string,
325 const char *normalizer_type_label,
326 uint32_t **normalize_table,
327 size_t normalize_table_size,
328 normalizer_func custom_normalizer)
329 {
330 const char *original, *rest;
331 unsigned int original_length_in_bytes, rest_length;
332 char *normalized;
333 unsigned int normalized_length_in_bytes = 0;
334 unsigned int normalized_n_characters = 0;
335 unsigned char *types = NULL;
336 unsigned char *current_type = NULL;
337 short *checks = NULL;
338 short *current_check = NULL;
339 grn_encoding encoding;
340 int flags;
341 grn_bool remove_blank_p;
342
343 encoding = grn_string_get_encoding(ctx, string);
344 flags = grn_string_get_flags(ctx, string);
345 remove_blank_p = flags & GRN_STRING_REMOVE_BLANK;
346 grn_string_get_original(ctx, string, &original, &original_length_in_bytes);
347 {
348 unsigned int max_normalized_length_in_bytes =
349 original_length_in_bytes + 1;
350 normalized = GRN_PLUGIN_MALLOC(ctx, max_normalized_length_in_bytes);
351 }
352 if (flags & GRN_STRING_WITH_TYPES) {
353 unsigned int max_normalized_n_characters = original_length_in_bytes + 1;
354 types = GRN_PLUGIN_MALLOC(ctx, max_normalized_n_characters);
355 current_type = types;
356 }
357 if (flags & GRN_STRING_WITH_CHECKS) {
358 unsigned int max_checks_size = sizeof(short) * original_length_in_bytes + 1;
359 checks = GRN_PLUGIN_MALLOC(ctx, max_checks_size);
360 current_check = checks;
361 current_check[0] = 0;
362 }
363 rest = original;
364 rest_length = original_length_in_bytes;
365 while (rest_length > 0) {
366 int character_length;
367 grn_bool custom_normalized = GRN_FALSE;
368 unsigned int normalized_character_length;
369 unsigned int previous_normalized_length_in_bytes =
370 normalized_length_in_bytes;
371 unsigned int previous_normalized_n_characters =
372 normalized_n_characters;
373
374 character_length = grn_plugin_charlen(ctx, rest, rest_length, encoding);
375 if (character_length == 0) {
376 break;
377 }
378
379 if (custom_normalizer) {
380 custom_normalized = custom_normalizer(ctx,
381 rest,
382 &character_length,
383 rest_length - character_length,
384 normalize_table,
385 normalized,
386 &normalized_character_length,
387 &normalized_length_in_bytes,
388 &normalized_n_characters);
389 }
390 if (!custom_normalized) {
391 normalize_character(rest, character_length,
392 normalize_table, normalize_table_size,
393 normalized,
394 &normalized_character_length,
395 &normalized_length_in_bytes,
396 &normalized_n_characters);
397 }
398
399 if (remove_blank_p &&
400 normalized_character_length == 1 &&
401 normalized[previous_normalized_length_in_bytes] == ' ') {
402 if (current_type > types) {
403 current_type[-1] |= GRN_CHAR_BLANK;
404 }
405 if (current_check) {
406 current_check[0]++;
407 }
408 normalized_length_in_bytes = previous_normalized_length_in_bytes;
409 normalized_n_characters = previous_normalized_n_characters;
410 } else {
411 if (current_type && normalized_character_length > 0) {
412 char *current_normalized;
413 current_normalized =
414 normalized + normalized_length_in_bytes - normalized_character_length;
415 current_type[0] =
416 grn_nfkc_char_type((unsigned char *)current_normalized);
417 current_type++;
418 }
419 if (current_check) {
420 current_check[0] += character_length;
421 if (normalized_character_length > 0) {
422 unsigned int i;
423 current_check++;
424 for (i = 1; i < normalized_character_length; i++) {
425 current_check[0] = 0;
426 current_check++;
427 }
428 current_check[0] = 0;
429 }
430 }
431 }
432
433 rest += character_length;
434 rest_length -= character_length;
435 }
436 if (current_type) {
437 current_type[0] = GRN_CHAR_NULL;
438 }
439 normalized[normalized_length_in_bytes] = '\0';
440
441 if (rest_length > 0) {
442 char buffer[SNIPPET_BUFFER_SIZE];
443 GRN_PLUGIN_LOG(ctx, GRN_LOG_DEBUG,
444 "[normalizer][%s] failed to normalize at %u byte: %s",
445 normalizer_type_label,
446 original_length_in_bytes - rest_length,
447 snippet(original,
448 original_length_in_bytes,
449 original_length_in_bytes - rest_length,
450 buffer,
451 SNIPPET_BUFFER_SIZE));
452 }
453 grn_string_set_normalized(ctx,
454 string,
455 normalized,
456 normalized_length_in_bytes,
457 normalized_n_characters);
458 grn_string_set_types(ctx, string, types);
459 grn_string_set_checks(ctx, string, checks);
460 }
461
462 static grn_obj *
mysql_general_ci_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)463 mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx,
464 GNUC_UNUSED int nargs,
465 grn_obj **args,
466 GNUC_UNUSED grn_user_data *user_data)
467 {
468 grn_obj *string = args[0];
469 grn_encoding encoding;
470 const char *normalizer_type_label = "mysql-general-ci";
471
472 encoding = grn_string_get_encoding(ctx, string);
473 if (encoding != GRN_ENC_UTF8) {
474 GRN_PLUGIN_ERROR(ctx,
475 GRN_FUNCTION_NOT_IMPLEMENTED,
476 "[normalizer][%s] "
477 "UTF-8 encoding is only supported: %s",
478 normalizer_type_label,
479 grn_encoding_to_string(encoding));
480 return NULL;
481 }
482 normalize(ctx, string, normalizer_type_label,
483 general_ci_table, sizeof(general_ci_table) / sizeof(uint32_t *),
484 NULL);
485 return NULL;
486 }
487
488 static grn_obj *
mysql_unicode_ci_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)489 mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx,
490 GNUC_UNUSED int nargs,
491 grn_obj **args,
492 GNUC_UNUSED grn_user_data *user_data)
493 {
494 grn_obj *string = args[0];
495 grn_encoding encoding;
496 const char *normalizer_type_label = "mysql-unicode-ci";
497
498 encoding = grn_string_get_encoding(ctx, string);
499 if (encoding != GRN_ENC_UTF8) {
500 GRN_PLUGIN_ERROR(ctx,
501 GRN_FUNCTION_NOT_IMPLEMENTED,
502 "[normalizer][%s] "
503 "UTF-8 encoding is only supported: %s",
504 normalizer_type_label,
505 grn_encoding_to_string(encoding));
506 return NULL;
507 }
508 normalize(ctx, string, normalizer_type_label,
509 unicode_ci_table, sizeof(unicode_ci_table) / sizeof(uint32_t *),
510 NULL);
511 return NULL;
512 }
513
/* Code points of the halfwidth katakana letters that
 * normalize_halfwidth_katakana_with_voiced_sound_mark() can combine with a
 * following sound mark. KA..TO is tested as one contiguous range. */
#define HALFWIDTH_KATAKANA_LETTER_KA 0xff76
#define HALFWIDTH_KATAKANA_LETTER_KI 0xff77
#define HALFWIDTH_KATAKANA_LETTER_KU 0xff78
#define HALFWIDTH_KATAKANA_LETTER_KE 0xff79
#define HALFWIDTH_KATAKANA_LETTER_KO 0xff7a

#define HALFWIDTH_KATAKANA_LETTER_SA 0xff7b
#define HALFWIDTH_KATAKANA_LETTER_SI 0xff7c
#define HALFWIDTH_KATAKANA_LETTER_SU 0xff7d
#define HALFWIDTH_KATAKANA_LETTER_SE 0xff7e
#define HALFWIDTH_KATAKANA_LETTER_SO 0xff7f

#define HALFWIDTH_KATAKANA_LETTER_TA 0xff80
#define HALFWIDTH_KATAKANA_LETTER_TI 0xff81
#define HALFWIDTH_KATAKANA_LETTER_TU 0xff82
#define HALFWIDTH_KATAKANA_LETTER_TE 0xff83
#define HALFWIDTH_KATAKANA_LETTER_TO 0xff84

/* HA..HO is the second range; it accepts both sound marks. */
#define HALFWIDTH_KATAKANA_LETTER_HA 0xff8a
#define HALFWIDTH_KATAKANA_LETTER_HI 0xff8b
#define HALFWIDTH_KATAKANA_LETTER_HU 0xff8c
#define HALFWIDTH_KATAKANA_LETTER_HE 0xff8d
#define HALFWIDTH_KATAKANA_LETTER_HO 0xff8e

/* The two marks looked for right after one of the letters above. */
#define HALFWIDTH_KATAKANA_VOICED_SOUND_MARK 0xff9e
#define HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK 0xff9f

/* Output for the KA..TO range: HIRAGANA_LETTER_KA + OFFSET + index * GAP,
 * where index is the distance from HALFWIDTH_KATAKANA_LETTER_KA. The caller
 * adds one more for TU..TO. */
#define HIRAGANA_LETTER_KA 0x304b
#define HIRAGANA_VOICED_SOUND_MARK_OFFSET 1
#define HIRAGANA_VOICED_SOUND_MARK_GAP 2

/* Output for the HA..HO range: HIRAGANA_LETTER_HA + index * GAP, plus
 * BA_OFFSET for the voiced mark or PA_OFFSET for the semi-voiced mark. */
#define HIRAGANA_LETTER_HA 0x306f
#define HIRAGANA_HA_LINE_BA_OFFSET 1
#define HIRAGANA_HA_LINE_PA_OFFSET 2
#define HIRAGANA_HA_LINE_GAP 3
549
550 static grn_bool
normalize_halfwidth_katakana_with_voiced_sound_mark(grn_ctx * ctx,const char * utf8,int * character_length,int rest_length,GNUC_UNUSED uint32_t ** normalize_table,char * normalized,unsigned int * normalized_character_length,unsigned int * normalized_length_in_bytes,unsigned int * normalized_n_characters)551 normalize_halfwidth_katakana_with_voiced_sound_mark(
552 grn_ctx *ctx,
553 const char *utf8,
554 int *character_length,
555 int rest_length,
556 GNUC_UNUSED uint32_t **normalize_table,
557 char *normalized,
558 unsigned int *normalized_character_length,
559 unsigned int *normalized_length_in_bytes,
560 unsigned int *normalized_n_characters)
561 {
562 grn_bool custom_normalized = GRN_FALSE;
563 grn_bool is_voiced_sound_markable_halfwidth_katakana = GRN_FALSE;
564 grn_bool is_semi_voiced_sound_markable_halfwidth_katakana = GRN_FALSE;
565 grn_bool is_ha_line = GRN_FALSE;
566 uint32_t unichar;
567
568 if (*character_length != 3) {
569 return GRN_FALSE;
570 }
571 if (rest_length < 3) {
572 return GRN_FALSE;
573 }
574
575 unichar = utf8_to_unichar(utf8, *character_length);
576 if (HALFWIDTH_KATAKANA_LETTER_KA <= unichar &&
577 unichar <= HALFWIDTH_KATAKANA_LETTER_TO) {
578 is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE;
579 } else if (HALFWIDTH_KATAKANA_LETTER_HA <= unichar &&
580 unichar <= HALFWIDTH_KATAKANA_LETTER_HO) {
581 is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE;
582 is_semi_voiced_sound_markable_halfwidth_katakana = GRN_TRUE;
583 is_ha_line = GRN_TRUE;
584 }
585
586 if (!is_voiced_sound_markable_halfwidth_katakana &&
587 !is_semi_voiced_sound_markable_halfwidth_katakana) {
588 return GRN_FALSE;
589 }
590
591 {
592 int next_character_length;
593 uint32_t next_unichar;
594 next_character_length = grn_plugin_charlen(ctx,
595 utf8 + *character_length,
596 rest_length,
597 GRN_ENC_UTF8);
598 if (next_character_length != 3) {
599 return GRN_FALSE;
600 }
601 next_unichar = utf8_to_unichar(utf8 + *character_length,
602 next_character_length);
603 if (next_unichar == HALFWIDTH_KATAKANA_VOICED_SOUND_MARK) {
604 if (is_voiced_sound_markable_halfwidth_katakana) {
605 unsigned int n_bytes;
606 if (is_ha_line) {
607 n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA +
608 HIRAGANA_HA_LINE_BA_OFFSET +
609 ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) *
610 HIRAGANA_HA_LINE_GAP),
611 normalized + *normalized_length_in_bytes);
612 } else {
613 int small_tu_offset = 0;
614 if (HALFWIDTH_KATAKANA_LETTER_TU <= unichar &&
615 unichar <= HALFWIDTH_KATAKANA_LETTER_TO) {
616 small_tu_offset = 1;
617 }
618 n_bytes = unichar_to_utf8(HIRAGANA_LETTER_KA +
619 HIRAGANA_VOICED_SOUND_MARK_OFFSET +
620 small_tu_offset +
621 ((unichar - HALFWIDTH_KATAKANA_LETTER_KA) *
622 HIRAGANA_VOICED_SOUND_MARK_GAP),
623 normalized + *normalized_length_in_bytes);
624 }
625 *character_length += next_character_length;
626 *normalized_character_length = n_bytes;
627 *normalized_length_in_bytes += n_bytes;
628 (*normalized_n_characters)++;
629 custom_normalized = GRN_TRUE;
630 }
631 } else if (next_unichar == HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK) {
632 if (is_semi_voiced_sound_markable_halfwidth_katakana) {
633 unsigned int n_bytes;
634 n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA +
635 HIRAGANA_HA_LINE_PA_OFFSET +
636 ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) *
637 HIRAGANA_HA_LINE_GAP),
638 normalized + *normalized_length_in_bytes);
639 *character_length += next_character_length;
640 *normalized_character_length = n_bytes;
641 *normalized_length_in_bytes += n_bytes;
642 (*normalized_n_characters)++;
643 custom_normalized = GRN_TRUE;
644 }
645 }
646 }
647
648 return custom_normalized;
649 }
650
651 static grn_obj *
mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)652 mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
653 GNUC_UNUSED grn_ctx *ctx,
654 GNUC_UNUSED int nargs,
655 grn_obj **args,
656 GNUC_UNUSED grn_user_data *user_data)
657 {
658 grn_obj *string = args[0];
659 grn_encoding encoding;
660 const char *normalizer_type_label =
661 "mysql-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark";
662
663 encoding = grn_string_get_encoding(ctx, string);
664 if (encoding != GRN_ENC_UTF8) {
665 GRN_PLUGIN_ERROR(ctx,
666 GRN_FUNCTION_NOT_IMPLEMENTED,
667 "[normalizer][%s] "
668 "UTF-8 encoding is only supported: %s",
669 normalizer_type_label,
670 grn_encoding_to_string(encoding));
671 return NULL;
672 }
673 normalize(ctx, string,
674 normalizer_type_label,
675 unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table,
676 sizeof(unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *),
677 normalize_halfwidth_katakana_with_voiced_sound_mark);
678 return NULL;
679 }
680
681 static grn_obj *
mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)682 mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx *ctx,
683 GNUC_UNUSED int nargs,
684 grn_obj **args,
685 GNUC_UNUSED grn_user_data *user_data)
686 {
687 grn_obj *string = args[0];
688 grn_encoding encoding;
689 const char *normalizer_type_label = "mysql-unicode-520-ci";
690
691 encoding = grn_string_get_encoding(ctx, string);
692 if (encoding != GRN_ENC_UTF8) {
693 GRN_PLUGIN_ERROR(ctx,
694 GRN_FUNCTION_NOT_IMPLEMENTED,
695 "[normalizer][%s] "
696 "UTF-8 encoding is only supported: %s",
697 normalizer_type_label,
698 grn_encoding_to_string(encoding));
699 return NULL;
700 }
701 normalize(ctx, string, normalizer_type_label,
702 unicode_520_ci_table,
703 sizeof(unicode_520_ci_table) / sizeof(uint32_t *),
704 NULL);
705 return NULL;
706 }
707
708 static grn_obj *
mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)709 mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
710 GNUC_UNUSED grn_ctx *ctx,
711 GNUC_UNUSED int nargs,
712 grn_obj **args,
713 GNUC_UNUSED grn_user_data *user_data)
714 {
715 grn_obj *string = args[0];
716 grn_encoding encoding;
717 const char *normalizer_type_label =
718 "mysql-unicode-520-ci-except-kana-ci-kana-with-voiced-sound-mark";
719
720 encoding = grn_string_get_encoding(ctx, string);
721 if (encoding != GRN_ENC_UTF8) {
722 GRN_PLUGIN_ERROR(ctx,
723 GRN_FUNCTION_NOT_IMPLEMENTED,
724 "[normalizer][%s] "
725 "UTF-8 encoding is only supported: %s",
726 normalizer_type_label,
727 grn_encoding_to_string(encoding));
728 return NULL;
729 }
730 normalize(ctx, string,
731 normalizer_type_label,
732 unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table,
733 sizeof(unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *),
734 normalize_halfwidth_katakana_with_voiced_sound_mark);
735 return NULL;
736 }
737
/*
 * Plugin initialization entry point. Performs no setup of its own and
 * returns the return code currently stored in `ctx`.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  return ctx->rc;
}
743
744 grn_rc
GRN_PLUGIN_REGISTER(grn_ctx * ctx)745 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
746 {
747 grn_normalizer_register(ctx, "NormalizerMySQLGeneralCI", -1,
748 NULL, mysql_general_ci_next, NULL);
749 grn_normalizer_register(ctx, "NormalizerMySQLUnicodeCI", -1,
750 NULL, mysql_unicode_ci_next, NULL);
751 grn_normalizer_register(ctx,
752 "NormalizerMySQLUnicodeCI"
753 "Except"
754 "KanaCI"
755 "KanaWithVoicedSoundMark",
756 -1,
757 NULL,
758 mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next,
759 NULL);
760 grn_normalizer_register(ctx, "NormalizerMySQLUnicode520CI", -1,
761 NULL, mysql_unicode_520_ci_next, NULL);
762 grn_normalizer_register(ctx,
763 "NormalizerMySQLUnicode520CI"
764 "Except"
765 "KanaCI"
766 "KanaWithVoicedSoundMark",
767 -1,
768 NULL,
769 mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next,
770 NULL);
771 return GRN_SUCCESS;
772 }
773
/*
 * Plugin finalization entry point. Does nothing and always reports
 * GRN_SUCCESS.
 */
grn_rc
GRN_PLUGIN_FIN(GNUC_UNUSED grn_ctx *ctx)
{
  return GRN_SUCCESS;
}
779