1 /* -*- c-basic-offset: 2 -*- */
2 /*
3   Copyright(C) 2012 Brazil
4 
5   This library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License version 2.1 as published by the Free Software Foundation.
8 
9   This library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13 
14   You should have received a copy of the GNU Lesser General Public
15   License along with this library; if not, write to the Free Software
16   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18 
19 #include <string.h>
20 
21 #include "grn_normalizer.h"
22 #include "grn_string.h"
23 #include "grn_nfkc.h"
24 #include <groonga/normalizer.h>
25 #include <groonga/tokenizer.h>
26 
27 grn_rc
grn_normalizer_register(grn_ctx * ctx,const char * name_ptr,int name_length,grn_proc_func * init,grn_proc_func * next,grn_proc_func * fin)28 grn_normalizer_register(grn_ctx *ctx,
29                         const char *name_ptr,
30                         int name_length,
31                         grn_proc_func *init,
32                         grn_proc_func *next,
33                         grn_proc_func *fin)
34 {
35   grn_expr_var vars[] = {
36     { NULL, 0 }
37   };
38   GRN_PTR_INIT(&vars[0].value, 0, GRN_ID_NIL);
39 
40   if (name_length < 0) {
41     name_length = strlen(name_ptr);
42   }
43 
44   {
45     grn_obj * const normalizer = grn_proc_create(ctx,
46                                                  name_ptr, name_length,
47                                                  GRN_PROC_NORMALIZER,
48                                                  init, next, fin,
49                                                  sizeof(*vars) / sizeof(vars),
50                                                  vars);
51     if (!normalizer) {
52       GRN_PLUGIN_ERROR(ctx, GRN_NORMALIZER_ERROR,
53                        "[normalizer] failed to register normalizer: <%.*s>",
54                        name_length, name_ptr);
55       return ctx->rc;
56     }
57   }
58   return GRN_SUCCESS;
59 }
60 
61 grn_rc
grn_normalizer_init(void)62 grn_normalizer_init(void)
63 {
64   return GRN_SUCCESS;
65 }
66 
67 grn_rc
grn_normalizer_fin(void)68 grn_normalizer_fin(void)
69 {
70   return GRN_SUCCESS;
71 }
72 
/*
 * ASCII replacements for 2-byte symbol characters.
 *
 * eucjp_normalize() indexes it with (second byte - 0xa4) for lead byte 0xa1.
 * sjis_normalize() indexes it with (second byte - 0x43) for second bytes
 * 0x43..0x7e and with (second byte - 0x44) for second bytes 0x7f..0x97 of
 * lead byte 0x81.
 *
 * An entry of 0 means "no ASCII replacement": the callers then keep the
 * original 2-byte character. The table is read-only lookup data, hence const.
 */
static const unsigned char symbol[] = {
  ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
  '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
};
80 
81 inline static grn_obj *
eucjp_normalize(grn_ctx * ctx,grn_string * nstr)82 eucjp_normalize(grn_ctx *ctx, grn_string *nstr)
83 {
84   static uint16_t hankana[] = {
85     0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
86     0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
87     0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
88     0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
89     0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
90     0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
91     0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
92     0xa1eb
93   };
94   static unsigned char dakuten[] = {
95     0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
96     0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
97     0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
98     0, 0xdc
99   };
100   static unsigned char handaku[] = {
101     0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
102   };
103   int16_t *ch;
104   const unsigned char *s, *s_, *e;
105   unsigned char *d, *d0, *d_, b;
106   uint_least8_t *cp, *ctypes, ctype;
107   size_t size = nstr->original_length_in_bytes, length = 0;
108   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
109   if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
110     ERR(GRN_NO_MEMORY_AVAILABLE,
111         "[string][eucjp] failed to allocate normalized text space");
112     return NULL;
113   }
114   d0 = (unsigned char *) nstr->normalized;
115   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
116     if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
117       GRN_FREE(nstr->normalized);
118       nstr->normalized = NULL;
119       ERR(GRN_NO_MEMORY_AVAILABLE,
120           "[string][eucjp] failed to allocate checks space");
121       return NULL;
122     }
123   }
124   ch = nstr->checks;
125   if (nstr->flags & GRN_STRING_WITH_TYPES) {
126     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
127       GRN_FREE(nstr->checks);
128       GRN_FREE(nstr->normalized);
129       nstr->checks = NULL;
130       nstr->normalized = NULL;
131       ERR(GRN_NO_MEMORY_AVAILABLE,
132           "[string][eucjp] failed to allocate character types space");
133       return NULL;
134     }
135   }
136   cp = ctypes = nstr->ctypes;
137   e = (unsigned char *)nstr->original + size;
138   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
139     if ((*s & 0x80)) {
140       if (((s + 1) < e) && (*(s + 1) & 0x80)) {
141         unsigned char c1 = *s++, c2 = *s, c3 = 0;
142         switch (c1 >> 4) {
143         case 0x08 :
144           if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
145             uint16_t c = hankana[c2 - 0xa0];
146             switch (c) {
147             case 0xa1ab :
148               if (d > d0 + 1 && d[-2] == 0xa5
149                   && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
150                 *(d - 1) = b;
151                 if (ch) { ch[-1] += 2; s_ += 2; }
152                 continue;
153               } else {
154                 *d++ = c >> 8; *d = c & 0xff;
155               }
156               break;
157             case 0xa1eb :
158               if (d > d0 + 1 && d[-2] == 0xa5
159                   && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
160                 *(d - 1) = b;
161                 if (ch) { ch[-1] += 2; s_ += 2; }
162                 continue;
163               } else {
164                 *d++ = c >> 8; *d = c & 0xff;
165               }
166               break;
167             default :
168               *d++ = c >> 8; *d = c & 0xff;
169               break;
170             }
171             ctype = GRN_CHAR_KATAKANA;
172           } else {
173             *d++ = c1; *d = c2;
174             ctype = GRN_CHAR_OTHERS;
175           }
176           break;
177         case 0x09 :
178           *d++ = c1; *d = c2;
179           ctype = GRN_CHAR_OTHERS;
180           break;
181         case 0x0a :
182           switch (c1 & 0x0f) {
183           case 1 :
184             switch (c2) {
185             case 0xbc :
186               *d++ = c1; *d = c2;
187               ctype = GRN_CHAR_KATAKANA;
188               break;
189             case 0xb9 :
190               *d++ = c1; *d = c2;
191               ctype = GRN_CHAR_KANJI;
192               break;
193             case 0xa1 :
194               if (removeblankp) {
195                 if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
196                 continue;
197               } else {
198                 *d = ' ';
199                 ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
200               }
201               break;
202             default :
203               if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
204                 *d = c3;
205                 ctype = GRN_CHAR_SYMBOL;
206               } else {
207                 *d++ = c1; *d = c2;
208                 ctype = GRN_CHAR_OTHERS;
209               }
210               break;
211             }
212             break;
213           case 2 :
214             *d++ = c1; *d = c2;
215             ctype = GRN_CHAR_SYMBOL;
216             break;
217           case 3 :
218             c3 = c2 - 0x80;
219             if ('a' <= c3 && c3 <= 'z') {
220               ctype = GRN_CHAR_ALPHA;
221               *d = c3;
222             } else if ('A' <= c3 && c3 <= 'Z') {
223               ctype = GRN_CHAR_ALPHA;
224               *d = c3 + 0x20;
225             } else if ('0' <= c3 && c3 <= '9') {
226               ctype = GRN_CHAR_DIGIT;
227               *d = c3;
228             } else {
229               ctype = GRN_CHAR_OTHERS;
230               *d++ = c1; *d = c2;
231             }
232             break;
233           case 4 :
234             *d++ = c1; *d = c2;
235             ctype = GRN_CHAR_HIRAGANA;
236             break;
237           case 5 :
238             *d++ = c1; *d = c2;
239             ctype = GRN_CHAR_KATAKANA;
240             break;
241           case 6 :
242           case 7 :
243           case 8 :
244             *d++ = c1; *d = c2;
245             ctype = GRN_CHAR_SYMBOL;
246             break;
247           default :
248             *d++ = c1; *d = c2;
249             ctype = GRN_CHAR_OTHERS;
250             break;
251           }
252           break;
253         default :
254           *d++ = c1; *d = c2;
255           ctype = GRN_CHAR_KANJI;
256           break;
257         }
258       } else {
259         /* skip invalid character */
260         continue;
261       }
262     } else {
263       unsigned char c = *s;
264       switch (c >> 4) {
265       case 0 :
266       case 1 :
267         /* skip unprintable ascii */
268         if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
269         continue;
270       case 2 :
271         if (c == 0x20) {
272           if (removeblankp) {
273             if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
274             continue;
275           } else {
276             *d = ' ';
277             ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
278           }
279         } else {
280           *d = c;
281           ctype = GRN_CHAR_SYMBOL;
282         }
283         break;
284       case 3 :
285         *d = c;
286         ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
287         break;
288       case 4 :
289         *d = ('A' <= c) ? c + 0x20 : c;
290         ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
291         break;
292       case 5 :
293         *d = (c <= 'Z') ? c + 0x20 : c;
294         ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
295         break;
296       case 6 :
297         *d = c;
298         ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
299         break;
300       case 7 :
301         *d = c;
302         ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
303         break;
304       default :
305         *d = c;
306         ctype = GRN_CHAR_OTHERS;
307         break;
308       }
309     }
310     d++;
311     length++;
312     if (cp) { *cp++ = ctype; }
313     if (ch) {
314       *ch++ = (int16_t)(s + 1 - s_);
315       s_ = s + 1;
316       while (++d_ < d) { *ch++ = 0; }
317     }
318   }
319   if (cp) { *cp = GRN_CHAR_NULL; }
320   *d = '\0';
321   nstr->n_characters = length;
322   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
323   return NULL;
324 }
325 
326 inline static grn_obj *
sjis_normalize(grn_ctx * ctx,grn_string * nstr)327 sjis_normalize(grn_ctx *ctx, grn_string *nstr)
328 {
329   static uint16_t hankana[] = {
330     0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
331     0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
332     0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
333     0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
334     0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
335     0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
336     0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
337     0x814b
338   };
339   static unsigned char dakuten[] = {
340     0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
341     0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
342     0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
343     0, 0x7b
344   };
345   static unsigned char handaku[] = {
346     0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
347   };
348   int16_t *ch;
349   const unsigned char *s, *s_;
350   unsigned char *d, *d0, *d_, b, *e;
351   uint_least8_t *cp, *ctypes, ctype;
352   size_t size = nstr->original_length_in_bytes, length = 0;
353   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
354   if (!(nstr->normalized = GRN_MALLOC(size * 2 + 1))) {
355     ERR(GRN_NO_MEMORY_AVAILABLE,
356         "[string][sjis] failed to allocate normalized text space");
357     return NULL;
358   }
359   d0 = (unsigned char *) nstr->normalized;
360   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
361     if (!(nstr->checks = GRN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
362       GRN_FREE(nstr->normalized);
363       nstr->normalized = NULL;
364       ERR(GRN_NO_MEMORY_AVAILABLE,
365           "[string][sjis] failed to allocate checks space");
366       return NULL;
367     }
368   }
369   ch = nstr->checks;
370   if (nstr->flags & GRN_STRING_WITH_TYPES) {
371     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
372       GRN_FREE(nstr->checks);
373       GRN_FREE(nstr->normalized);
374       nstr->checks = NULL;
375       nstr->normalized = NULL;
376       ERR(GRN_NO_MEMORY_AVAILABLE,
377           "[string][sjis] failed to allocate character types space");
378       return NULL;
379     }
380   }
381   cp = ctypes = nstr->ctypes;
382   e = (unsigned char *)nstr->original + size;
383   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
384     if ((*s & 0x80)) {
385       if (0xa0 <= *s && *s <= 0xdf) {
386         uint16_t c = hankana[*s - 0xa0];
387         switch (c) {
388         case 0x814a :
389           if (d > d0 + 1 && d[-2] == 0x83
390               && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
391             *(d - 1) = b;
392             if (ch) { ch[-1]++; s_++; }
393             continue;
394           } else {
395             *d++ = c >> 8; *d = c & 0xff;
396           }
397           break;
398         case 0x814b :
399           if (d > d0 + 1 && d[-2] == 0x83
400               && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
401             *(d - 1) = b;
402             if (ch) { ch[-1]++; s_++; }
403             continue;
404           } else {
405             *d++ = c >> 8; *d = c & 0xff;
406           }
407           break;
408         default :
409           *d++ = c >> 8; *d = c & 0xff;
410           break;
411         }
412         ctype = GRN_CHAR_KATAKANA;
413       } else {
414         if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
415           unsigned char c1 = *s++, c2 = *s, c3 = 0;
416           if (0x81 <= c1 && c1 <= 0x87) {
417             switch (c1 & 0x0f) {
418             case 1 :
419               switch (c2) {
420               case 0x5b :
421                 *d++ = c1; *d = c2;
422                 ctype = GRN_CHAR_KATAKANA;
423                 break;
424               case 0x58 :
425                 *d++ = c1; *d = c2;
426                 ctype = GRN_CHAR_KANJI;
427                 break;
428               case 0x40 :
429                 if (removeblankp) {
430                   if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
431                   continue;
432                 } else {
433                   *d = ' ';
434                   ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
435                 }
436                 break;
437               default :
438                 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
439                   *d = c3;
440                   ctype = GRN_CHAR_SYMBOL;
441                 } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
442                   *d = c3;
443                   ctype = GRN_CHAR_SYMBOL;
444                 } else {
445                   *d++ = c1; *d = c2;
446                   ctype = GRN_CHAR_OTHERS;
447                 }
448                 break;
449               }
450               break;
451             case 2 :
452               c3 = c2 - 0x1f;
453               if (0x4f <= c2 && c2 <= 0x58) {
454                 ctype = GRN_CHAR_DIGIT;
455                 *d = c2 - 0x1f;
456               } else if (0x60 <= c2 && c2 <= 0x79) {
457                 ctype = GRN_CHAR_ALPHA;
458                 *d = c2 + 0x01;
459               } else if (0x81 <= c2 && c2 <= 0x9a) {
460                 ctype = GRN_CHAR_ALPHA;
461                 *d = c2 - 0x20;
462               } else if (0x9f <= c2 && c2 <= 0xf1) {
463                 *d++ = c1; *d = c2;
464                 ctype = GRN_CHAR_HIRAGANA;
465               } else {
466                 *d++ = c1; *d = c2;
467                 ctype = GRN_CHAR_OTHERS;
468               }
469               break;
470             case 3 :
471               if (0x40 <= c2 && c2 <= 0x96) {
472                 *d++ = c1; *d = c2;
473                 ctype = GRN_CHAR_KATAKANA;
474               } else {
475                 *d++ = c1; *d = c2;
476                 ctype = GRN_CHAR_SYMBOL;
477               }
478               break;
479             case 4 :
480             case 7 :
481               *d++ = c1; *d = c2;
482               ctype = GRN_CHAR_SYMBOL;
483               break;
484             default :
485               *d++ = c1; *d = c2;
486               ctype = GRN_CHAR_OTHERS;
487               break;
488             }
489           } else {
490             *d++ = c1; *d = c2;
491             ctype = GRN_CHAR_KANJI;
492           }
493         } else {
494           /* skip invalid character */
495           continue;
496         }
497       }
498     } else {
499       unsigned char c = *s;
500       switch (c >> 4) {
501       case 0 :
502       case 1 :
503         /* skip unprintable ascii */
504         if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
505         continue;
506       case 2 :
507         if (c == 0x20) {
508           if (removeblankp) {
509             if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
510             continue;
511           } else {
512             *d = ' ';
513             ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
514           }
515         } else {
516           *d = c;
517           ctype = GRN_CHAR_SYMBOL;
518         }
519         break;
520       case 3 :
521         *d = c;
522         ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
523         break;
524       case 4 :
525         *d = ('A' <= c) ? c + 0x20 : c;
526         ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
527         break;
528       case 5 :
529         *d = (c <= 'Z') ? c + 0x20 : c;
530         ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
531         break;
532       case 6 :
533         *d = c;
534         ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
535         break;
536       case 7 :
537         *d = c;
538         ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
539         break;
540       default :
541         *d = c;
542         ctype = GRN_CHAR_OTHERS;
543         break;
544       }
545     }
546     d++;
547     length++;
548     if (cp) { *cp++ = ctype; }
549     if (ch) {
550       *ch++ = (int16_t)(s + 1 - s_);
551       s_ = s + 1;
552       while (++d_ < d) { *ch++ = 0; }
553     }
554   }
555   if (cp) { *cp = GRN_CHAR_NULL; }
556   *d = '\0';
557   nstr->n_characters = length;
558   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
559   return NULL;
560 }
561 
562 #ifdef GRN_WITH_NFKC
563 static inline int
grn_str_charlen_utf8(grn_ctx * ctx,const unsigned char * str,const unsigned char * end)564 grn_str_charlen_utf8(grn_ctx *ctx, const unsigned char *str, const unsigned char *end)
565 {
566   /* MEMO: This function allows non-null-terminated string as str. */
567   /*       But requires the end of string. */
568   const unsigned char *p = str;
569   if (end <= p || !*p) { return 0; }
570   if (*p & 0x80) {
571     int b, w;
572     int size;
573     int i;
574     for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
575     if (!w) {
576       GRN_LOG(ctx, GRN_LOG_WARNING,
577               "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>",
578               (int)(end - p), p,
579               (int)(end - str), str);
580       return 0;
581     }
582     size = w + 1;
583     for (i = 1; i < size; i++) {
584       if (++p >= end) {
585         GRN_LOG(ctx, GRN_LOG_WARNING,
586                 "invalid utf8 string: too short: "
587                 "%d byte is required but %d byte is given: <%.*s>",
588                 size, i,
589                 (int)(end - str), str);
590         return 0;
591       }
592       if (!*p) {
593         GRN_LOG(ctx, GRN_LOG_WARNING,
594                 "invalid utf8 string: NULL character is found: <%.*s>",
595                 (int)(end - str), str);
596         return 0;
597       }
598       if ((*p & 0xc0) != 0x80) {
599         GRN_LOG(ctx, GRN_LOG_WARNING,
600                 "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>",
601                 (int)(end - p), p,
602                 (int)(end - str), str);
603         return 0;
604       }
605     }
606     return size;
607   } else {
608     return 1;
609   }
610   return 0;
611 }
612 
613 inline static grn_obj *
utf8_normalize(grn_ctx * ctx,grn_string * nstr)614 utf8_normalize(grn_ctx *ctx, grn_string *nstr)
615 {
616   int16_t *ch;
617   const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
618   unsigned char *d, *d_, *de;
619   uint_least8_t *cp;
620   size_t length = 0, ls, lp, size = nstr->original_length_in_bytes, ds = size * 3;
621   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
622   grn_bool remove_tokenized_delimiter_p =
623     nstr->flags & GRN_STRING_REMOVE_TOKENIZED_DELIMITER;
624   if (!(nstr->normalized = GRN_MALLOC(ds + 1))) {
625     ERR(GRN_NO_MEMORY_AVAILABLE,
626         "[string][utf8] failed to allocate normalized text space");
627     return NULL;
628   }
629   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
630     if (!(nstr->checks = GRN_MALLOC(ds * sizeof(int16_t) + 1))) {
631       GRN_FREE(nstr->normalized);
632       nstr->normalized = NULL;
633       ERR(GRN_NO_MEMORY_AVAILABLE,
634           "[string][utf8] failed to allocate checks space");
635       return NULL;
636     }
637   }
638   ch = nstr->checks;
639   if (nstr->flags & GRN_STRING_WITH_TYPES) {
640     if (!(nstr->ctypes = GRN_MALLOC(ds + 1))) {
641       if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
642       GRN_FREE(nstr->normalized); nstr->normalized = NULL;
643       ERR(GRN_NO_MEMORY_AVAILABLE,
644           "[string][utf8] failed to allocate character types space");
645       return NULL;
646     }
647   }
648   cp = nstr->ctypes;
649   d = (unsigned char *)nstr->normalized;
650   de = d + ds;
651   d_ = NULL;
652   e = (unsigned char *)nstr->original + size;
653   for (s = s_ = (unsigned char *)nstr->original; ; s += ls) {
654     if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
655       break;
656     }
657     if (remove_tokenized_delimiter_p &&
658         grn_tokenizer_is_tokenized_delimiter(ctx, (const char *)s, ls,
659                                              GRN_ENC_UTF8)) {
660       continue;
661     }
662     if ((p = (unsigned char *)grn_nfkc_decompose(s))) {
663       pe = p + strlen((char *)p);
664     } else {
665       p = s;
666       pe = p + ls;
667     }
668     if (d_ && (p2 = (unsigned char *)grn_nfkc_compose(d_, p))) {
669       p = p2;
670       pe = p + strlen((char *)p);
671       if (cp) { cp--; }
672       if (ch) {
673         ch -= (d - d_);
674         if (ch[0] >= 0) {
675           s_ = s__;
676         }
677       }
678       d = d_;
679       length--;
680     }
681     for (; ; p += lp) {
682       if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
683         break;
684       }
685       if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
686         if (cp > nstr->ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
687       } else {
688         if (de <= d + lp) {
689           unsigned char *normalized;
690           ds += (ds >> 1) + lp;
691           if (!(normalized = GRN_REALLOC(nstr->normalized, ds + 1))) {
692             if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
693             if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
694             GRN_FREE(nstr->normalized); nstr->normalized = NULL;
695             ERR(GRN_NO_MEMORY_AVAILABLE,
696                 "[string][utf8] failed to expand normalized text space");
697             return NULL;
698           }
699           de = normalized + ds;
700           d = normalized + (d - (unsigned char *)nstr->normalized);
701           nstr->normalized = (char *)normalized;
702           if (ch) {
703             int16_t *checks;
704             if (!(checks = GRN_REALLOC(nstr->checks, ds * sizeof(int16_t) + 1))) {
705               if (nstr->ctypes) { GRN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
706               GRN_FREE(nstr->checks); nstr->checks = NULL;
707               GRN_FREE(nstr->normalized); nstr->normalized = NULL;
708               ERR(GRN_NO_MEMORY_AVAILABLE,
709                   "[string][utf8] failed to expand checks space");
710               return NULL;
711             }
712             ch = checks + (ch - nstr->checks);
713             nstr->checks = checks;
714           }
715           if (cp) {
716             uint_least8_t *ctypes;
717             if (!(ctypes = GRN_REALLOC(nstr->ctypes, ds + 1))) {
718               GRN_FREE(nstr->ctypes); nstr->ctypes = NULL;
719               if (nstr->checks) { GRN_FREE(nstr->checks); nstr->checks = NULL; }
720               GRN_FREE(nstr->normalized); nstr->normalized = NULL;
721               ERR(GRN_NO_MEMORY_AVAILABLE,
722                   "[string][utf8] failed to expand character types space");
723               return NULL;
724             }
725             cp = ctypes + (cp - nstr->ctypes);
726             nstr->ctypes = ctypes;
727           }
728         }
729         grn_memcpy(d, p, lp);
730         d_ = d;
731         d += lp;
732         length++;
733         if (cp) { *cp++ = grn_nfkc_char_type(p); }
734         if (ch) {
735           size_t i;
736           if (s_ == s + ls) {
737             *ch++ = -1;
738           } else {
739             *ch++ = (int16_t)(s + ls - s_);
740             s__ = s_;
741             s_ = s + ls;
742           }
743           for (i = lp; i > 1; i--) { *ch++ = 0; }
744         }
745       }
746     }
747   }
748   if (cp) { *cp = GRN_CHAR_NULL; }
749   *d = '\0';
750   nstr->n_characters = length;
751   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
752   return NULL;
753 }
754 #endif /* GRN_WITH_NFKC */
755 
756 inline static grn_obj *
ascii_normalize(grn_ctx * ctx,grn_string * nstr)757 ascii_normalize(grn_ctx *ctx, grn_string *nstr)
758 {
759   int16_t *ch;
760   const unsigned char *s, *s_, *e;
761   unsigned char *d, *d0, *d_;
762   uint_least8_t *cp, *ctypes, ctype;
763   size_t size = nstr->original_length_in_bytes, length = 0;
764   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
765   if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
766     ERR(GRN_NO_MEMORY_AVAILABLE,
767         "[string][ascii] failed to allocate normalized text space");
768     return NULL;
769   }
770   d0 = (unsigned char *) nstr->normalized;
771   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
772     if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
773       GRN_FREE(nstr->normalized);
774       nstr->normalized = NULL;
775       ERR(GRN_NO_MEMORY_AVAILABLE,
776           "[string][ascii] failed to allocate checks space");
777       return NULL;
778     }
779   }
780   ch = nstr->checks;
781   if (nstr->flags & GRN_STRING_WITH_TYPES) {
782     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
783       GRN_FREE(nstr->checks);
784       GRN_FREE(nstr->normalized);
785       nstr->checks = NULL;
786       nstr->normalized = NULL;
787       ERR(GRN_NO_MEMORY_AVAILABLE,
788           "[string][ascii] failed to allocate character types space");
789       return NULL;
790     }
791   }
792   cp = ctypes = nstr->ctypes;
793   e = (unsigned char *)nstr->original + size;
794   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
795     unsigned char c = *s;
796     switch (c >> 4) {
797     case 0 :
798     case 1 :
799       /* skip unprintable ascii */
800       if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
801       continue;
802     case 2 :
803       if (c == 0x20) {
804         if (removeblankp) {
805           if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
806           continue;
807         } else {
808           *d = ' ';
809           ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
810         }
811       } else {
812         *d = c;
813         ctype = GRN_CHAR_SYMBOL;
814       }
815       break;
816     case 3 :
817       *d = c;
818       ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
819       break;
820     case 4 :
821       *d = ('A' <= c) ? c + 0x20 : c;
822       ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
823       break;
824     case 5 :
825       *d = (c <= 'Z') ? c + 0x20 : c;
826       ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
827       break;
828     case 6 :
829       *d = c;
830       ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
831       break;
832     case 7 :
833       *d = c;
834       ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
835       break;
836     default :
837       *d = c;
838       ctype = GRN_CHAR_OTHERS;
839       break;
840     }
841     d++;
842     length++;
843     if (cp) { *cp++ = ctype; }
844     if (ch) {
845       *ch++ = (int16_t)(s + 1 - s_);
846       s_ = s + 1;
847       while (++d_ < d) { *ch++ = 0; }
848     }
849   }
850   if (cp) { *cp = GRN_CHAR_NULL; }
851   *d = '\0';
852   nstr->n_characters = length;
853   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
854   return NULL;
855 }
856 
857 /* use cp1252 as latin1 */
858 inline static grn_obj *
latin1_normalize(grn_ctx * ctx,grn_string * nstr)859 latin1_normalize(grn_ctx *ctx, grn_string *nstr)
860 {
861   int16_t *ch;
862   const unsigned char *s, *s_, *e;
863   unsigned char *d, *d0, *d_;
864   uint_least8_t *cp, *ctypes, ctype;
865   size_t size = nstr->original_length_in_bytes, length = 0;
866   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
867   if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
868     ERR(GRN_NO_MEMORY_AVAILABLE,
869         "[string][latin1] failed to allocate normalized text space");
870     return NULL;
871   }
872   d0 = (unsigned char *) nstr->normalized;
873   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
874     if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
875       GRN_FREE(nstr->normalized);
876       nstr->normalized = NULL;
877       ERR(GRN_NO_MEMORY_AVAILABLE,
878           "[string][latin1] failed to allocate checks space");
879       return NULL;
880     }
881   }
882   ch = nstr->checks;
883   if (nstr->flags & GRN_STRING_WITH_TYPES) {
884     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
885       GRN_FREE(nstr->checks);
886       GRN_FREE(nstr->normalized);
887       nstr->checks = NULL;
888       nstr->normalized = NULL;
889       ERR(GRN_NO_MEMORY_AVAILABLE,
890           "[normalizer][latin1] failed to allocate character types space");
891       return NULL;
892     }
893   }
894   cp = ctypes = nstr->ctypes;
895   e = (unsigned char *)nstr->original + size;
896   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
897     unsigned char c = *s;
898     switch (c >> 4) {
899     case 0 :
900     case 1 :
901       /* skip unprintable ascii */
902       if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
903       continue;
904     case 2 :
905       if (c == 0x20) {
906         if (removeblankp) {
907           if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
908           continue;
909         } else {
910           *d = ' ';
911           ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
912         }
913       } else {
914         *d = c;
915         ctype = GRN_CHAR_SYMBOL;
916       }
917       break;
918     case 3 :
919       *d = c;
920       ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
921       break;
922     case 4 :
923       *d = ('A' <= c) ? c + 0x20 : c;
924       ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
925       break;
926     case 5 :
927       *d = (c <= 'Z') ? c + 0x20 : c;
928       ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
929       break;
930     case 6 :
931       *d = c;
932       ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
933       break;
934     case 7 :
935       *d = c;
936       ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
937       break;
938     case 8 :
939       if (c == 0x8a || c == 0x8c || c == 0x8e) {
940         *d = c + 0x10;
941         ctype = GRN_CHAR_ALPHA;
942       } else {
943         *d = c;
944         ctype = GRN_CHAR_SYMBOL;
945       }
946       break;
947     case 9 :
948       if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
949         *d = (c == 0x9f) ? c + 0x60 : c;
950         ctype = GRN_CHAR_ALPHA;
951       } else {
952         *d = c;
953         ctype = GRN_CHAR_SYMBOL;
954       }
955       break;
956     case 0x0c :
957       *d = c + 0x20;
958       ctype = GRN_CHAR_ALPHA;
959       break;
960     case 0x0d :
961       *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
962       ctype = (c == 0xd7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
963       break;
964     case 0x0e :
965       *d = c;
966       ctype = GRN_CHAR_ALPHA;
967       break;
968     case 0x0f :
969       *d = c;
970       ctype = (c == 0xf7) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
971       break;
972     default :
973       *d = c;
974       ctype = GRN_CHAR_OTHERS;
975       break;
976     }
977     d++;
978     length++;
979     if (cp) { *cp++ = ctype; }
980     if (ch) {
981       *ch++ = (int16_t)(s + 1 - s_);
982       s_ = s + 1;
983       while (++d_ < d) { *ch++ = 0; }
984     }
985   }
986   if (cp) { *cp = GRN_CHAR_NULL; }
987   *d = '\0';
988   nstr->n_characters = length;
989   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
990   return NULL;
991 }
992 
993 inline static grn_obj *
koi8r_normalize(grn_ctx * ctx,grn_string * nstr)994 koi8r_normalize(grn_ctx *ctx, grn_string *nstr)
995 {
996   int16_t *ch;
997   const unsigned char *s, *s_, *e;
998   unsigned char *d, *d0, *d_;
999   uint_least8_t *cp, *ctypes, ctype;
1000   size_t size = nstr->original_length_in_bytes, length = 0;
1001   int removeblankp = nstr->flags & GRN_STRING_REMOVE_BLANK;
1002   if (!(nstr->normalized = GRN_MALLOC(size + 1))) {
1003     ERR(GRN_NO_MEMORY_AVAILABLE,
1004         "[string][koi8r] failed to allocate normalized text space");
1005     return NULL;
1006   }
1007   d0 = (unsigned char *) nstr->normalized;
1008   if (nstr->flags & GRN_STRING_WITH_CHECKS) {
1009     if (!(nstr->checks = GRN_MALLOC(size * sizeof(int16_t) + 1))) {
1010       GRN_FREE(nstr->normalized);
1011       nstr->normalized = NULL;
1012       ERR(GRN_NO_MEMORY_AVAILABLE,
1013           "[string][koi8r] failed to allocate checks space");
1014       return NULL;
1015     }
1016   }
1017   ch = nstr->checks;
1018   if (nstr->flags & GRN_STRING_WITH_TYPES) {
1019     if (!(nstr->ctypes = GRN_MALLOC(size + 1))) {
1020       GRN_FREE(nstr->checks);
1021       GRN_FREE(nstr->normalized);
1022       nstr->checks = NULL;
1023       nstr->normalized = NULL;
1024       ERR(GRN_NO_MEMORY_AVAILABLE,
1025           "[string][koi8r] failed to allocate character types space");
1026       return NULL;
1027     }
1028   }
1029   cp = ctypes = nstr->ctypes;
1030   e = (unsigned char *)nstr->original + size;
1031   for (s = s_ = (unsigned char *) nstr->original, d = d_ = d0; s < e; s++) {
1032     unsigned char c = *s;
1033     switch (c >> 4) {
1034     case 0 :
1035     case 1 :
1036       /* skip unprintable ascii */
1037       if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
1038       continue;
1039     case 2 :
1040       if (c == 0x20) {
1041         if (removeblankp) {
1042           if (cp > ctypes) { *(cp - 1) |= GRN_CHAR_BLANK; }
1043           continue;
1044         } else {
1045           *d = ' ';
1046           ctype = GRN_CHAR_BLANK|GRN_CHAR_SYMBOL;
1047         }
1048       } else {
1049         *d = c;
1050         ctype = GRN_CHAR_SYMBOL;
1051       }
1052       break;
1053     case 3 :
1054       *d = c;
1055       ctype = (c <= 0x39) ? GRN_CHAR_DIGIT : GRN_CHAR_SYMBOL;
1056       break;
1057     case 4 :
1058       *d = ('A' <= c) ? c + 0x20 : c;
1059       ctype = (c == 0x40) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
1060       break;
1061     case 5 :
1062       *d = (c <= 'Z') ? c + 0x20 : c;
1063       ctype = (c <= 0x5a) ? GRN_CHAR_ALPHA : GRN_CHAR_SYMBOL;
1064       break;
1065     case 6 :
1066       *d = c;
1067       ctype = (c == 0x60) ? GRN_CHAR_SYMBOL : GRN_CHAR_ALPHA;
1068       break;
1069     case 7 :
1070       *d = c;
1071       ctype = (c <= 0x7a) ? GRN_CHAR_ALPHA : (c == 0x7f ? GRN_CHAR_OTHERS : GRN_CHAR_SYMBOL);
1072       break;
1073     case 0x0a :
1074       *d = c;
1075       ctype = (c == 0xa3) ? GRN_CHAR_ALPHA : GRN_CHAR_OTHERS;
1076       break;
1077     case 0x0b :
1078       if (c == 0xb3) {
1079         *d = c - 0x10;
1080         ctype = GRN_CHAR_ALPHA;
1081       } else {
1082         *d = c;
1083         ctype = GRN_CHAR_OTHERS;
1084       }
1085       break;
1086     case 0x0c :
1087     case 0x0d :
1088       *d = c;
1089       ctype = GRN_CHAR_ALPHA;
1090       break;
1091     case 0x0e :
1092     case 0x0f :
1093       *d = c - 0x20;
1094       ctype = GRN_CHAR_ALPHA;
1095       break;
1096     default :
1097       *d = c;
1098       ctype = GRN_CHAR_OTHERS;
1099       break;
1100     }
1101     d++;
1102     length++;
1103     if (cp) { *cp++ = ctype; }
1104     if (ch) {
1105       *ch++ = (int16_t)(s + 1 - s_);
1106       s_ = s + 1;
1107       while (++d_ < d) { *ch++ = 0; }
1108     }
1109   }
1110   if (cp) { *cp = GRN_CHAR_NULL; }
1111   *d = '\0';
1112   nstr->n_characters = length;
1113   nstr->normalized_length_in_bytes = (size_t)(d - (unsigned char *)nstr->normalized);
1114   return NULL;
1115 }
1116 
1117 static grn_obj *
auto_next(grn_ctx * ctx,int nargs,grn_obj ** args,grn_user_data * user_data)1118 auto_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
1119 {
1120   grn_string *string = (grn_string *)(args[0]);
1121   switch (string->encoding) {
1122   case GRN_ENC_EUC_JP :
1123     eucjp_normalize(ctx, string);
1124     break;
1125   case GRN_ENC_UTF8 :
1126 #ifdef GRN_WITH_NFKC
1127     utf8_normalize(ctx, string);
1128 #else /* GRN_WITH_NFKC */
1129     ascii_normalize(ctx, string);
1130 #endif /* GRN_WITH_NFKC */
1131     break;
1132   case GRN_ENC_SJIS :
1133     sjis_normalize(ctx, string);
1134     break;
1135   case GRN_ENC_LATIN1 :
1136     latin1_normalize(ctx, string);
1137     break;
1138   case GRN_ENC_KOI8R :
1139     koi8r_normalize(ctx, string);
1140     break;
1141   default :
1142     ascii_normalize(ctx, string);
1143     break;
1144   }
1145   return NULL;
1146 }
1147 
#ifdef GRN_WITH_NFKC
/*
 * "next" handler of the NFKC51 normalizer (only built with GRN_WITH_NFKC).
 *
 * Runs utf8_normalize() on the grn_string passed as args[0], whatever
 * its encoding field says. Always returns NULL.
 */
static grn_obj *
nfkc51_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  utf8_normalize(ctx, (grn_string *)(args[0]));
  return NULL;
}
#endif /* GRN_WITH_NFKC */
1157 
1158 grn_rc
grn_normalizer_normalize(grn_ctx * ctx,grn_obj * normalizer,grn_obj * string)1159 grn_normalizer_normalize(grn_ctx *ctx, grn_obj *normalizer, grn_obj *string)
1160 {
1161   grn_rc rc;
1162   int nargs = 0;
1163 
1164   grn_ctx_push(ctx, string);
1165   nargs++;
1166   rc = grn_proc_call(ctx, normalizer, nargs, NULL);
1167   grn_ctx_pop(ctx);
1168 
1169   return rc;
1170 }
1171 
1172 grn_rc
grn_db_init_builtin_normalizers(grn_ctx * ctx)1173 grn_db_init_builtin_normalizers(grn_ctx *ctx)
1174 {
1175   const char *normalizer_nfkc51_name = "NormalizerNFKC51";
1176 
1177   grn_normalizer_register(ctx, GRN_NORMALIZER_AUTO_NAME, -1,
1178                           NULL, auto_next, NULL);
1179 
1180 #ifdef GRN_WITH_NFKC
1181   grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
1182                           NULL, nfkc51_next, NULL);
1183 #else /* GRN_WITH_NFKC */
1184   grn_normalizer_register(ctx, normalizer_nfkc51_name, -1,
1185                           NULL, NULL, NULL);
1186 #endif /* GRN_WITH_NFKC */
1187 /*
1188   grn_normalizer_register(ctx, "NormalizerUCA", -1,
1189                           NULL, uca_next, NULL);
1190 */
1191 
1192   return GRN_SUCCESS;
1193 }
1194