/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2013-2015  Kouhei Sutou <kou@clear-code.com>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public
  License as published by the Free Software Foundation; version 2
  of the License.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Library General Public License for more details.

  You should have received a copy of the GNU Library General Public
  License along with this library; if not, write to the Free
  Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
  MA 02110-1301  USA
*/
20 
21 #ifdef HAVE_CONFIG_H
22 #  include <config.h>
23 #endif
24 
25 #ifdef GROONGA_NORMALIZER_MYSQL_EMBED
26 #  define GRN_PLUGIN_FUNCTION_TAG normalizers_mysql
27 #endif
28 
29 #include <groonga/normalizer.h>
30 #include <groonga/nfkc.h>
31 
32 #include <string.h>
33 #include <stdio.h>
34 
35 #include "mysql_general_ci_table.h"
36 #include "mysql_unicode_ci_table.h"
37 #include "mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
38 #include "mysql_unicode_520_ci_table.h"
39 #include "mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table.h"
40 
/* Marks a parameter as intentionally unused so that GCC-compatible
   compilers do not warn about it; expands to nothing elsewhere. */
#ifdef __GNUC__
#  define GNUC_UNUSED __attribute__((__unused__))
#else
#  define GNUC_UNUSED
#endif

/* When building with Microsoft's compiler, map the standard spellings to
   the underscore-prefixed names it provides. */
#ifdef _MSC_VER
#  define inline _inline
#  define snprintf _snprintf
#endif

/* Size in bytes of the stack buffer that normalize() passes to snippet()
   when it logs the bytes around a normalization failure. */
#define SNIPPET_BUFFER_SIZE 256
53 
54 typedef grn_bool (*normalizer_func)(grn_ctx *ctx,
55                                     const char *utf8,
56                                     int *character_length,
57                                     int rest_length,
58                                     uint32_t **normalize_table,
59                                     char *normalized,
60                                     unsigned int *normalized_characer_length,
61                                     unsigned int *normalized_length_in_bytes,
62                                     unsigned int *normalized_n_characters);
63 
/*
 * Encodes the code point `unichar` as UTF-8 into `output` and returns the
 * number of bytes written (1 to 6).
 *
 * `output` must have room for the bytes written; no terminating NUL is
 * added. The 5- and 6-byte forms of the original UTF-8 definition are
 * produced for values of 0x200000 and above. The 6-byte branch only yields
 * a well-formed lead byte for values up to 0x7fffffff.
 */
static inline unsigned int
unichar_to_utf8(uint32_t unichar, char *output)
{
  unsigned int n_bytes;

  if (unichar < 0x80) {
    output[0] = (char)unichar;
    n_bytes = 1;
  } else if (unichar < 0x0800) {
    output[0] = (char)(((unichar >> 6) & 0x1f) | 0xc0);
    output[1] = (char)((unichar & 0x3f) | 0x80);
    n_bytes = 2;
  } else if (unichar < 0x10000) {
    output[0] = (char)((unichar >> 12) | 0xe0);
    output[1] = (char)(((unichar >> 6) & 0x3f) | 0x80);
    output[2] = (char)((unichar & 0x3f) | 0x80);
    n_bytes = 3;
  } else if (unichar < 0x200000) {
    output[0] = (char)((unichar >> 18) | 0xf0);
    output[1] = (char)(((unichar >> 12) & 0x3f) | 0x80);
    output[2] = (char)(((unichar >> 6) & 0x3f) | 0x80);
    output[3] = (char)((unichar & 0x3f) | 0x80);
    n_bytes = 4;
  } else if (unichar < 0x4000000) {
    output[0] = (char)((unichar >> 24) | 0xf8);
    output[1] = (char)(((unichar >> 18) & 0x3f) | 0x80);
    output[2] = (char)(((unichar >> 12) & 0x3f) | 0x80);
    output[3] = (char)(((unichar >> 6) & 0x3f) | 0x80);
    output[4] = (char)((unichar & 0x3f) | 0x80);
    n_bytes = 5;
  } else {
    output[0] = (char)((unichar >> 30) | 0xfc);
    output[1] = (char)(((unichar >> 24) & 0x3f) | 0x80);
    output[2] = (char)(((unichar >> 18) & 0x3f) | 0x80);
    output[3] = (char)(((unichar >> 12) & 0x3f) | 0x80);
    output[4] = (char)(((unichar >> 6) & 0x3f) | 0x80);
    output[5] = (char)((unichar & 0x3f) | 0x80);
    n_bytes = 6;
  }

  return n_bytes;
}
106 
/*
 * Decodes one UTF-8 sequence of `byte_size` bytes starting at `utf8` and
 * returns its code point.
 *
 * Only the payload bits are extracted; lead and continuation markers are
 * not validated. Any `byte_size` outside 1..6 yields 0. All arithmetic is
 * done in uint32_t so that no shift happens on a signed int.
 */
static inline uint32_t
utf8_to_unichar(const char *utf8, int byte_size)
{
  uint32_t unichar;
  const unsigned char *bytes = (const unsigned char *)utf8;

  switch (byte_size) {
  case 1 :
    unichar = (uint32_t)(bytes[0] & 0x7f);
    break;
  case 2 :
    unichar =
      ((uint32_t)(bytes[0] & 0x1f) << 6) +
      (uint32_t)(bytes[1] & 0x3f);
    break;
  case 3 :
    unichar =
      ((uint32_t)(bytes[0] & 0x0f) << 12) +
      ((uint32_t)(bytes[1] & 0x3f) << 6) +
      (uint32_t)(bytes[2] & 0x3f);
    break;
  case 4 :
    unichar =
      ((uint32_t)(bytes[0] & 0x07) << 18) +
      ((uint32_t)(bytes[1] & 0x3f) << 12) +
      ((uint32_t)(bytes[2] & 0x3f) << 6) +
      (uint32_t)(bytes[3] & 0x3f);
    break;
  case 5 :
    unichar =
      ((uint32_t)(bytes[0] & 0x03) << 24) +
      ((uint32_t)(bytes[1] & 0x3f) << 18) +
      ((uint32_t)(bytes[2] & 0x3f) << 12) +
      ((uint32_t)(bytes[3] & 0x3f) << 6) +
      (uint32_t)(bytes[4] & 0x3f);
    break;
  case 6 :
    unichar =
      ((uint32_t)(bytes[0] & 0x01) << 30) +
      ((uint32_t)(bytes[1] & 0x3f) << 24) +
      ((uint32_t)(bytes[2] & 0x3f) << 18) +
      ((uint32_t)(bytes[3] & 0x3f) << 12) +
      ((uint32_t)(bytes[4] & 0x3f) << 6) +
      (uint32_t)(bytes[5] & 0x3f);
    break;
  default :
    unichar = 0;
    break;
  }

  return unichar;
}
157 
/*
 * Splits the code point of one UTF-8 character into a table page and an
 * index inside that page.
 *
 *   rest             start of the character.
 *   character_length its length in bytes (1..6).
 *   page             receives the code point with its low 8 bits removed.
 *   low_code         receives the low 8 bits of the code point.
 *
 * For any other `character_length`, *page is set to (size_t)-1 and
 * *low_code to 0. Bytes are read as unsigned char so the result does not
 * depend on whether plain char is signed.
 */
static inline void
decompose_character(const char *rest, int character_length,
                    size_t *page, uint32_t *low_code)
{
  const unsigned char *bytes = (const unsigned char *)rest;

  switch (character_length) {
  case 1 :
    *page = 0x00;
    *low_code = bytes[0] & 0x7f;
    break;
  case 2 :
    *page = (bytes[0] & 0x1c) >> 2;
    *low_code = ((bytes[0] & 0x03) << 6) + (bytes[1] & 0x3f);
    break;
  case 3 :
    *page = ((bytes[0] & 0x0f) << 4) + ((bytes[1] & 0x3c) >> 2);
    *low_code = ((bytes[1] & 0x03) << 6) + (bytes[2] & 0x3f);
    break;
  case 4 :
    *page =
      ((bytes[0] & 0x07) << 10) +
      ((bytes[1] & 0x3f) << 4) +
      ((bytes[2] & 0x3c) >> 2);
    *low_code = ((bytes[2] & 0x03) << 6) + (bytes[3] & 0x3f);
    break;
  case 5 :
    *page =
      ((bytes[0] & 0x03) << 16) +
      ((bytes[1] & 0x3f) << 10) +
      ((bytes[2] & 0x3f) << 4) +
      ((bytes[3] & 0x3c) >> 2);
    *low_code = ((bytes[3] & 0x03) << 6) + (bytes[4] & 0x3f);
    break;
  case 6 :
    *page =
      ((bytes[0] & 0x01) << 22) +
      ((bytes[1] & 0x3f) << 16) +
      ((bytes[2] & 0x3f) << 10) +
      ((bytes[3] & 0x3f) << 4) +
      ((bytes[4] & 0x3c) >> 2);
    *low_code = ((bytes[4] & 0x03) << 6) + (bytes[5] & 0x3f);
    break;
  default :
    *page = (size_t)-1;
    *low_code = 0x00;
    break;
  }
}
205 
/*
 * Normalizes one UTF-8 character through the page-indexed lookup table and
 * appends the result to `normalized`.
 *
 *   utf8, character_length      the source character and its byte length.
 *   normalize_table             array of pages; a non-NULL page is indexed
 *                               by the low 8 bits of the code point.
 *   normalize_table_size        number of entries in normalize_table.
 *   normalized                  output buffer; writing starts at
 *                               normalized + *normalized_length_in_bytes.
 *   normalized_character_length receives the number of bytes appended
 *                               (0 when the table entry is 0, i.e. the
 *                               character is dropped).
 *   normalized_length_in_bytes,
 *   normalized_n_characters     running totals, advanced only when
 *                               something was appended.
 *
 * A character whose page is out of range or NULL is copied unchanged.
 *
 * NOTE(review): the capacity of `normalized` is not known here; the code
 * presumably relies on table entries never encoding to more bytes than the
 * source character -- confirm against the generated tables.
 */
static inline void
normalize_character(const char *utf8, int character_length,
                    uint32_t **normalize_table,
                    size_t normalize_table_size,
                    char *normalized,
                    unsigned int *normalized_character_length,
                    unsigned int *normalized_length_in_bytes,
                    unsigned int *normalized_n_characters)
{
  size_t page;
  uint32_t low_code;

  decompose_character(utf8, character_length, &page, &low_code);
  if (page < normalize_table_size && normalize_table[page]) {
    uint32_t normalized_code = normalize_table[page][low_code];

    if (normalized_code == 0x00000) {
      /* The table maps this character to nothing. */
      *normalized_character_length = 0;
    } else {
      unsigned int n_bytes;

      n_bytes = unichar_to_utf8(normalized_code,
                                normalized + *normalized_length_in_bytes);
      *normalized_character_length = n_bytes;
      *normalized_length_in_bytes += n_bytes;
      (*normalized_n_characters)++;
    }
  } else {
    int i;

    /* No page for this character: pass its bytes through as they are. */
    for (i = 0; i < character_length; i++) {
      normalized[*normalized_length_in_bytes + i] = utf8[i];
    }
    *normalized_character_length = character_length;
    *normalized_length_in_bytes += character_length;
    (*normalized_n_characters)++;
  }
}
241 
/*
 * Appends the NUL-terminated `string` to the NUL-terminated text already in
 * `buffer`, but only when it fits.
 *
 *   buffer             destination; must already hold a terminated string.
 *   buffer_length      total size of `buffer` (kept for the callers'
 *                      signature; the remaining space is what bounds the
 *                      copy).
 *   buffer_rest_length in/out: number of characters that may still be
 *                      appended; reduced by the length of `string` on
 *                      success.
 *
 * When `string` is as long as or longer than the remaining space, nothing
 * is appended and the counter is left untouched.
 */
static void
sized_buffer_append(char *buffer,
                    unsigned int buffer_length,
                    unsigned int *buffer_rest_length,
                    const char *string)
{
  size_t string_length;

  (void)buffer_length;

  string_length = strlen(string);
  if (string_length >= *buffer_rest_length) {
    return;
  }

  /* strncat()'s bound is the number of characters to append, not the size
     of the destination, so bound it by the space that is left. */
  strncat(buffer, string, *buffer_rest_length);
  *buffer_rest_length -= (unsigned int)string_length;
}
258 
/*
 * Appends a hexadecimal dump of `string_length` bytes of `string` to
 * `buffer`, formatted as "0x.." tokens separated by single spaces
 * (for example "0xe3 0x81 0x00").
 *
 * `buffer`, `buffer_length` and `buffer_rest_length` are forwarded to
 * sized_buffer_append(), so tokens that no longer fit are left out.
 */
static void
sized_buffer_dump_string(char *buffer,
                         unsigned int buffer_length,
                         unsigned int *buffer_rest_length,
                         const char *string, unsigned int string_length)
{
  const unsigned char *bytes = (const unsigned char *)string;
  unsigned int i;

  for (i = 0; i < string_length; i++) {
    char formatted_byte[5]; /* "0xFF\0" */

    if (i > 0) {
      sized_buffer_append(buffer, buffer_length, buffer_rest_length,
                          " ");
    }
    /* A literal "0x" prefix is used instead of the '#' flag because '#'
       omits the prefix for the value 0. */
    snprintf(formatted_byte, sizeof(formatted_byte), "0x%02x",
             (unsigned int)bytes[i]);
    sized_buffer_append(buffer, buffer_length, buffer_rest_length,
                        formatted_byte);
  }
}
287 
/*
 * Builds a short, printable description of the bytes of `string` starting
 * at offset `target_byte` and returns it.
 *
 *   string, length  the whole text and its length in bytes.
 *   target_byte     offset of the first byte to show.
 *   buffer          where the description is written.
 *   buffer_length   size of `buffer` in bytes.
 *
 * The result looks like "...<0x.. 0x.. ...>...": at most 12 bytes are
 * dumped between '<' and '>', with "..." in front when the window does not
 * start at offset 0 and "..." behind when bytes follow the window.
 *
 * Returns `buffer`, or an empty string literal when `buffer_length` is 0.
 * A `target_byte` beyond `length` is treated as `length`.
 */
static const char *
snippet(const char *string, unsigned int length, unsigned int target_byte,
        char *buffer, unsigned int buffer_length)
{
  const char *elision_mark = "...";
  const unsigned int max_window_length = 12;
  unsigned int window_length;
  unsigned int buffer_rest_length;

  if (buffer_length == 0) {
    /* No room even for the terminator. */
    return "";
  }
  buffer[0] = '\0';
  buffer_rest_length = buffer_length - 1;

  if (target_byte > length) {
    /* Keep "length - target_byte" below from wrapping around. */
    target_byte = length;
  }

  if (target_byte > 0) {
    sized_buffer_append(buffer, buffer_length, &buffer_rest_length,
                        elision_mark);
  }

  sized_buffer_append(buffer, buffer_length, &buffer_rest_length, "<");
  window_length = length - target_byte;
  if (window_length > max_window_length) {
    window_length = max_window_length;
  }
  sized_buffer_dump_string(buffer, buffer_length, &buffer_rest_length,
                           string + target_byte, window_length);
  sized_buffer_append(buffer, buffer_length, &buffer_rest_length,
                      ">");

  if (target_byte + window_length < length) {
    sized_buffer_append(buffer, buffer_length, &buffer_rest_length,
                        elision_mark);
  }

  return buffer;
}
322 
323 static void
normalize(grn_ctx * ctx,grn_obj * string,const char * normalizer_type_label,uint32_t ** normalize_table,size_t normalize_table_size,normalizer_func custom_normalizer)324 normalize(grn_ctx *ctx, grn_obj *string,
325           const char *normalizer_type_label,
326           uint32_t **normalize_table,
327           size_t normalize_table_size,
328           normalizer_func custom_normalizer)
329 {
330   const char *original, *rest;
331   unsigned int original_length_in_bytes, rest_length;
332   char *normalized;
333   unsigned int normalized_length_in_bytes = 0;
334   unsigned int normalized_n_characters = 0;
335   unsigned char *types = NULL;
336   unsigned char *current_type = NULL;
337   short *checks = NULL;
338   short *current_check = NULL;
339   grn_encoding encoding;
340   int flags;
341   grn_bool remove_blank_p;
342 
343   encoding = grn_string_get_encoding(ctx, string);
344   flags = grn_string_get_flags(ctx, string);
345   remove_blank_p = flags & GRN_STRING_REMOVE_BLANK;
346   grn_string_get_original(ctx, string, &original, &original_length_in_bytes);
347   {
348     unsigned int max_normalized_length_in_bytes =
349       original_length_in_bytes + 1;
350     normalized = GRN_PLUGIN_MALLOC(ctx, max_normalized_length_in_bytes);
351   }
352   if (flags & GRN_STRING_WITH_TYPES) {
353     unsigned int max_normalized_n_characters = original_length_in_bytes + 1;
354     types = GRN_PLUGIN_MALLOC(ctx, max_normalized_n_characters);
355     current_type = types;
356   }
357   if (flags & GRN_STRING_WITH_CHECKS) {
358     unsigned int max_checks_size = sizeof(short) * original_length_in_bytes + 1;
359     checks = GRN_PLUGIN_MALLOC(ctx, max_checks_size);
360     current_check = checks;
361     current_check[0] = 0;
362   }
363   rest = original;
364   rest_length = original_length_in_bytes;
365   while (rest_length > 0) {
366     int character_length;
367     grn_bool custom_normalized = GRN_FALSE;
368     unsigned int normalized_character_length;
369     unsigned int previous_normalized_length_in_bytes =
370       normalized_length_in_bytes;
371     unsigned int previous_normalized_n_characters =
372       normalized_n_characters;
373 
374     character_length = grn_plugin_charlen(ctx, rest, rest_length, encoding);
375     if (character_length == 0) {
376       break;
377     }
378 
379     if (custom_normalizer) {
380       custom_normalized = custom_normalizer(ctx,
381                                             rest,
382                                             &character_length,
383                                             rest_length - character_length,
384                                             normalize_table,
385                                             normalized,
386                                             &normalized_character_length,
387                                             &normalized_length_in_bytes,
388                                             &normalized_n_characters);
389     }
390     if (!custom_normalized) {
391       normalize_character(rest, character_length,
392                           normalize_table, normalize_table_size,
393                           normalized,
394                           &normalized_character_length,
395                           &normalized_length_in_bytes,
396                           &normalized_n_characters);
397     }
398 
399     if (remove_blank_p &&
400         normalized_character_length == 1 &&
401         normalized[previous_normalized_length_in_bytes] == ' ') {
402       if (current_type > types) {
403         current_type[-1] |= GRN_CHAR_BLANK;
404       }
405       if (current_check) {
406         current_check[0]++;
407       }
408       normalized_length_in_bytes = previous_normalized_length_in_bytes;
409       normalized_n_characters = previous_normalized_n_characters;
410     } else {
411       if (current_type && normalized_character_length > 0) {
412         char *current_normalized;
413         current_normalized =
414           normalized + normalized_length_in_bytes - normalized_character_length;
415         current_type[0] =
416           grn_nfkc_char_type((unsigned char *)current_normalized);
417         current_type++;
418       }
419       if (current_check) {
420         current_check[0] += character_length;
421         if (normalized_character_length > 0) {
422           unsigned int i;
423           current_check++;
424           for (i = 1; i < normalized_character_length; i++) {
425             current_check[0] = 0;
426             current_check++;
427           }
428           current_check[0] = 0;
429         }
430       }
431     }
432 
433     rest += character_length;
434     rest_length -= character_length;
435   }
436   if (current_type) {
437     current_type[0] = GRN_CHAR_NULL;
438   }
439   normalized[normalized_length_in_bytes] = '\0';
440 
441   if (rest_length > 0) {
442     char buffer[SNIPPET_BUFFER_SIZE];
443     GRN_PLUGIN_LOG(ctx, GRN_LOG_DEBUG,
444                    "[normalizer][%s] failed to normalize at %u byte: %s",
445                    normalizer_type_label,
446                    original_length_in_bytes - rest_length,
447                    snippet(original,
448                            original_length_in_bytes,
449                            original_length_in_bytes - rest_length,
450                            buffer,
451                            SNIPPET_BUFFER_SIZE));
452   }
453   grn_string_set_normalized(ctx,
454                             string,
455                             normalized,
456                             normalized_length_in_bytes,
457                             normalized_n_characters);
458   grn_string_set_types(ctx, string, types);
459   grn_string_set_checks(ctx, string, checks);
460 }
461 
462 static grn_obj *
mysql_general_ci_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)463 mysql_general_ci_next(GNUC_UNUSED grn_ctx *ctx,
464                       GNUC_UNUSED int nargs,
465                       grn_obj **args,
466                       GNUC_UNUSED grn_user_data *user_data)
467 {
468   grn_obj *string = args[0];
469   grn_encoding encoding;
470   const char *normalizer_type_label = "mysql-general-ci";
471 
472   encoding = grn_string_get_encoding(ctx, string);
473   if (encoding != GRN_ENC_UTF8) {
474     GRN_PLUGIN_ERROR(ctx,
475                      GRN_FUNCTION_NOT_IMPLEMENTED,
476                      "[normalizer][%s] "
477                      "UTF-8 encoding is only supported: %s",
478                      normalizer_type_label,
479                      grn_encoding_to_string(encoding));
480     return NULL;
481   }
482   normalize(ctx, string, normalizer_type_label,
483             general_ci_table, sizeof(general_ci_table) / sizeof(uint32_t *),
484             NULL);
485   return NULL;
486 }
487 
488 static grn_obj *
mysql_unicode_ci_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)489 mysql_unicode_ci_next(GNUC_UNUSED grn_ctx *ctx,
490                       GNUC_UNUSED int nargs,
491                       grn_obj **args,
492                       GNUC_UNUSED grn_user_data *user_data)
493 {
494   grn_obj *string = args[0];
495   grn_encoding encoding;
496   const char *normalizer_type_label = "mysql-unicode-ci";
497 
498   encoding = grn_string_get_encoding(ctx, string);
499   if (encoding != GRN_ENC_UTF8) {
500     GRN_PLUGIN_ERROR(ctx,
501                      GRN_FUNCTION_NOT_IMPLEMENTED,
502                      "[normalizer][%s] "
503                      "UTF-8 encoding is only supported: %s",
504                      normalizer_type_label,
505                      grn_encoding_to_string(encoding));
506     return NULL;
507   }
508   normalize(ctx, string, normalizer_type_label,
509             unicode_ci_table, sizeof(unicode_ci_table) / sizeof(uint32_t *),
510             NULL);
511   return NULL;
512 }
513 
/* Code points of the halfwidth katakana letters that can be followed by a
   (semi-)voiced sound mark: the KA, SA and TA lines are contiguous
   (0xff76..0xff84); the HA line is 0xff8a..0xff8e. */
#define HALFWIDTH_KATAKANA_LETTER_KA 0xff76
#define HALFWIDTH_KATAKANA_LETTER_KI 0xff77
#define HALFWIDTH_KATAKANA_LETTER_KU 0xff78
#define HALFWIDTH_KATAKANA_LETTER_KE 0xff79
#define HALFWIDTH_KATAKANA_LETTER_KO 0xff7a

#define HALFWIDTH_KATAKANA_LETTER_SA 0xff7b
#define HALFWIDTH_KATAKANA_LETTER_SI 0xff7c
#define HALFWIDTH_KATAKANA_LETTER_SU 0xff7d
#define HALFWIDTH_KATAKANA_LETTER_SE 0xff7e
#define HALFWIDTH_KATAKANA_LETTER_SO 0xff7f

#define HALFWIDTH_KATAKANA_LETTER_TA 0xff80
#define HALFWIDTH_KATAKANA_LETTER_TI 0xff81
#define HALFWIDTH_KATAKANA_LETTER_TU 0xff82
#define HALFWIDTH_KATAKANA_LETTER_TE 0xff83
#define HALFWIDTH_KATAKANA_LETTER_TO 0xff84

#define HALFWIDTH_KATAKANA_LETTER_HA 0xff8a
#define HALFWIDTH_KATAKANA_LETTER_HI 0xff8b
#define HALFWIDTH_KATAKANA_LETTER_HU 0xff8c
#define HALFWIDTH_KATAKANA_LETTER_HE 0xff8d
#define HALFWIDTH_KATAKANA_LETTER_HO 0xff8e

/* The standalone halfwidth sound marks that may follow the letters above. */
#define HALFWIDTH_KATAKANA_VOICED_SOUND_MARK      0xff9e
#define HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK 0xff9f

/* Hiragana layout from KA onwards: each plain letter is directly followed
   by its voiced form, so consecutive plain letters are 2 apart and the
   voiced form is at +1. */
#define HIRAGANA_LETTER_KA                0x304b
#define HIRAGANA_VOICED_SOUND_MARK_OFFSET 1
#define HIRAGANA_VOICED_SOUND_MARK_GAP    2

/* Hiragana HA line: each plain letter is followed by its voiced (BA, +1)
   and semi-voiced (PA, +2) forms, so consecutive plain letters are 3
   apart. */
#define HIRAGANA_LETTER_HA         0x306f
#define HIRAGANA_HA_LINE_BA_OFFSET 1
#define HIRAGANA_HA_LINE_PA_OFFSET 2
#define HIRAGANA_HA_LINE_GAP       3
549 
550 static grn_bool
normalize_halfwidth_katakana_with_voiced_sound_mark(grn_ctx * ctx,const char * utf8,int * character_length,int rest_length,GNUC_UNUSED uint32_t ** normalize_table,char * normalized,unsigned int * normalized_character_length,unsigned int * normalized_length_in_bytes,unsigned int * normalized_n_characters)551 normalize_halfwidth_katakana_with_voiced_sound_mark(
552   grn_ctx *ctx,
553   const char *utf8,
554   int *character_length,
555   int rest_length,
556   GNUC_UNUSED uint32_t **normalize_table,
557   char *normalized,
558   unsigned int *normalized_character_length,
559   unsigned int *normalized_length_in_bytes,
560   unsigned int *normalized_n_characters)
561 {
562   grn_bool custom_normalized = GRN_FALSE;
563   grn_bool is_voiced_sound_markable_halfwidth_katakana = GRN_FALSE;
564   grn_bool is_semi_voiced_sound_markable_halfwidth_katakana = GRN_FALSE;
565   grn_bool is_ha_line = GRN_FALSE;
566   uint32_t unichar;
567 
568   if (*character_length != 3) {
569     return GRN_FALSE;
570   }
571   if (rest_length < 3) {
572     return GRN_FALSE;
573   }
574 
575   unichar = utf8_to_unichar(utf8, *character_length);
576   if (HALFWIDTH_KATAKANA_LETTER_KA <= unichar &&
577       unichar <= HALFWIDTH_KATAKANA_LETTER_TO) {
578     is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE;
579   } else if (HALFWIDTH_KATAKANA_LETTER_HA <= unichar &&
580              unichar <= HALFWIDTH_KATAKANA_LETTER_HO) {
581     is_voiced_sound_markable_halfwidth_katakana = GRN_TRUE;
582     is_semi_voiced_sound_markable_halfwidth_katakana = GRN_TRUE;
583     is_ha_line = GRN_TRUE;
584   }
585 
586   if (!is_voiced_sound_markable_halfwidth_katakana &&
587       !is_semi_voiced_sound_markable_halfwidth_katakana) {
588     return GRN_FALSE;
589   }
590 
591   {
592     int next_character_length;
593     uint32_t next_unichar;
594     next_character_length = grn_plugin_charlen(ctx,
595                                                utf8 + *character_length,
596                                                rest_length,
597                                                GRN_ENC_UTF8);
598     if (next_character_length != 3) {
599       return GRN_FALSE;
600     }
601     next_unichar = utf8_to_unichar(utf8 + *character_length,
602                                    next_character_length);
603     if (next_unichar == HALFWIDTH_KATAKANA_VOICED_SOUND_MARK) {
604       if (is_voiced_sound_markable_halfwidth_katakana) {
605         unsigned int n_bytes;
606         if (is_ha_line) {
607           n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA +
608                                     HIRAGANA_HA_LINE_BA_OFFSET +
609                                     ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) *
610                                      HIRAGANA_HA_LINE_GAP),
611                                     normalized + *normalized_length_in_bytes);
612         } else {
613           int small_tu_offset = 0;
614           if (HALFWIDTH_KATAKANA_LETTER_TU <= unichar &&
615               unichar <= HALFWIDTH_KATAKANA_LETTER_TO) {
616             small_tu_offset = 1;
617           }
618           n_bytes = unichar_to_utf8(HIRAGANA_LETTER_KA +
619                                     HIRAGANA_VOICED_SOUND_MARK_OFFSET +
620                                     small_tu_offset +
621                                     ((unichar - HALFWIDTH_KATAKANA_LETTER_KA) *
622                                      HIRAGANA_VOICED_SOUND_MARK_GAP),
623                                     normalized + *normalized_length_in_bytes);
624         }
625         *character_length += next_character_length;
626         *normalized_character_length = n_bytes;
627         *normalized_length_in_bytes += n_bytes;
628         (*normalized_n_characters)++;
629         custom_normalized = GRN_TRUE;
630       }
631     } else if (next_unichar == HALFWIDTH_KATAKANA_SEMI_VOICED_SOUND_MARK) {
632       if (is_semi_voiced_sound_markable_halfwidth_katakana) {
633         unsigned int n_bytes;
634         n_bytes = unichar_to_utf8(HIRAGANA_LETTER_HA +
635                                   HIRAGANA_HA_LINE_PA_OFFSET +
636                                   ((unichar - HALFWIDTH_KATAKANA_LETTER_HA) *
637                                    HIRAGANA_HA_LINE_GAP),
638                                   normalized + *normalized_length_in_bytes);
639         *character_length += next_character_length;
640         *normalized_character_length = n_bytes;
641         *normalized_length_in_bytes += n_bytes;
642         (*normalized_n_characters)++;
643         custom_normalized = GRN_TRUE;
644       }
645     }
646   }
647 
648   return custom_normalized;
649 }
650 
651 static grn_obj *
mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)652 mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
653   GNUC_UNUSED grn_ctx *ctx,
654   GNUC_UNUSED int nargs,
655   grn_obj **args,
656   GNUC_UNUSED grn_user_data *user_data)
657 {
658   grn_obj *string = args[0];
659   grn_encoding encoding;
660   const char *normalizer_type_label =
661     "mysql-unicode-ci-except-kana-ci-kana-with-voiced-sound-mark";
662 
663   encoding = grn_string_get_encoding(ctx, string);
664   if (encoding != GRN_ENC_UTF8) {
665     GRN_PLUGIN_ERROR(ctx,
666                      GRN_FUNCTION_NOT_IMPLEMENTED,
667                      "[normalizer][%s] "
668                      "UTF-8 encoding is only supported: %s",
669                      normalizer_type_label,
670                      grn_encoding_to_string(encoding));
671     return NULL;
672   }
673   normalize(ctx, string,
674             normalizer_type_label,
675             unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table,
676             sizeof(unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *),
677             normalize_halfwidth_katakana_with_voiced_sound_mark);
678   return NULL;
679 }
680 
681 static grn_obj *
mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)682 mysql_unicode_520_ci_next(GNUC_UNUSED grn_ctx *ctx,
683                           GNUC_UNUSED int nargs,
684                           grn_obj **args,
685                           GNUC_UNUSED grn_user_data *user_data)
686 {
687   grn_obj *string = args[0];
688   grn_encoding encoding;
689   const char *normalizer_type_label = "mysql-unicode-520-ci";
690 
691   encoding = grn_string_get_encoding(ctx, string);
692   if (encoding != GRN_ENC_UTF8) {
693     GRN_PLUGIN_ERROR(ctx,
694                      GRN_FUNCTION_NOT_IMPLEMENTED,
695                      "[normalizer][%s] "
696                      "UTF-8 encoding is only supported: %s",
697                      normalizer_type_label,
698                      grn_encoding_to_string(encoding));
699     return NULL;
700   }
701   normalize(ctx, string, normalizer_type_label,
702             unicode_520_ci_table,
703             sizeof(unicode_520_ci_table) / sizeof(uint32_t *),
704             NULL);
705   return NULL;
706 }
707 
708 static grn_obj *
mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next(GNUC_UNUSED grn_ctx * ctx,GNUC_UNUSED int nargs,grn_obj ** args,GNUC_UNUSED grn_user_data * user_data)709 mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next(
710   GNUC_UNUSED grn_ctx *ctx,
711   GNUC_UNUSED int nargs,
712   grn_obj **args,
713   GNUC_UNUSED grn_user_data *user_data)
714 {
715   grn_obj *string = args[0];
716   grn_encoding encoding;
717   const char *normalizer_type_label =
718     "mysql-unicode-520-ci-except-kana-ci-kana-with-voiced-sound-mark";
719 
720   encoding = grn_string_get_encoding(ctx, string);
721   if (encoding != GRN_ENC_UTF8) {
722     GRN_PLUGIN_ERROR(ctx,
723                      GRN_FUNCTION_NOT_IMPLEMENTED,
724                      "[normalizer][%s] "
725                      "UTF-8 encoding is only supported: %s",
726                      normalizer_type_label,
727                      grn_encoding_to_string(encoding));
728     return NULL;
729   }
730   normalize(ctx, string,
731             normalizer_type_label,
732             unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table,
733             sizeof(unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_table) / sizeof(uint32_t *),
734             normalize_halfwidth_katakana_with_voiced_sound_mark);
735   return NULL;
736 }
737 
738 grn_rc
GRN_PLUGIN_INIT(grn_ctx * ctx)739 GRN_PLUGIN_INIT(grn_ctx *ctx)
740 {
741   return ctx->rc;
742 }
743 
744 grn_rc
GRN_PLUGIN_REGISTER(grn_ctx * ctx)745 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
746 {
747   grn_normalizer_register(ctx, "NormalizerMySQLGeneralCI", -1,
748                           NULL, mysql_general_ci_next, NULL);
749   grn_normalizer_register(ctx, "NormalizerMySQLUnicodeCI", -1,
750                           NULL, mysql_unicode_ci_next, NULL);
751   grn_normalizer_register(ctx,
752                           "NormalizerMySQLUnicodeCI"
753                           "Except"
754                           "KanaCI"
755                           "KanaWithVoicedSoundMark",
756                           -1,
757                           NULL,
758                           mysql_unicode_ci_except_kana_ci_kana_with_voiced_sound_mark_next,
759                           NULL);
760   grn_normalizer_register(ctx, "NormalizerMySQLUnicode520CI", -1,
761                           NULL, mysql_unicode_520_ci_next, NULL);
762   grn_normalizer_register(ctx,
763                           "NormalizerMySQLUnicode520CI"
764                           "Except"
765                           "KanaCI"
766                           "KanaWithVoicedSoundMark",
767                           -1,
768                           NULL,
769                           mysql_unicode_520_ci_except_kana_ci_kana_with_voiced_sound_mark_next,
770                           NULL);
771   return GRN_SUCCESS;
772 }
773 
774 grn_rc
GRN_PLUGIN_FIN(GNUC_UNUSED grn_ctx * ctx)775 GRN_PLUGIN_FIN(GNUC_UNUSED grn_ctx *ctx)
776 {
777   return GRN_SUCCESS;
778 }
779