/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2016 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/

#ifdef GRN_EMBEDDED
#  define GRN_PLUGIN_FUNCTION_TAG tokenizers_mecab
#endif

#include <grn_str.h>

#include <groonga.h>
#include <groonga/tokenizer.h>

#include <mecab.h>

#include <stdlib.h>
#include <string.h>
#include <ctype.h>

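/*
 * A single process-wide mecab_t is shared by every tokenization and is
 * guarded by sole_mecab_mutex: it is created lazily on first use and
 * never used concurrently, because the code does not assume a mecab_t
 * is thread-safe.
 */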
static unsigned int sole_mecab_init_counter = 0;
static mecab_t *sole_mecab = NULL;
static grn_plugin_mutex *sole_mecab_mutex = NULL;
static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;

static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
static int grn_mecab_chunk_size_threshold = 8192;

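/*
 * Per-tokenization state: `buf` owns the whole space-separated (wakati)
 * output from MeCab, and [next, end) is the not-yet-consumed part of it
 * that mecab_next() scans token by token.
 */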
typedef struct {
  mecab_t *mecab;
  grn_obj buf;
  const char *next;
  const char *end;
  grn_tokenizer_query *query;
  grn_tokenizer_token token;
} grn_mecab_tokenizer;

static const char *
mecab_global_error_message(void)
{
  double version;

  version = atof(mecab_version());
  /* MeCab <= 0.993 doesn't support mecab_strerror(NULL). */
  if (version <= 0.993) {
    return "Unknown";
  }

  return mecab_strerror(NULL);
}

static grn_encoding
translate_mecab_charset_to_grn_encoding(const char *charset)
{
  if (grn_strcasecmp(charset, "euc-jp") == 0) {
    return GRN_ENC_EUC_JP;
  } else if (grn_strcasecmp(charset, "utf-8") == 0 ||
             grn_strcasecmp(charset, "utf8") == 0) {
    return GRN_ENC_UTF8;
  } else if (grn_strcasecmp(charset, "shift_jis") == 0 ||
             grn_strcasecmp(charset, "shift-jis") == 0 ||
             grn_strcasecmp(charset, "sjis") == 0) {
    return GRN_ENC_SJIS;
  }
  return GRN_ENC_NONE;
}

static grn_encoding
get_mecab_encoding(mecab_t *mecab)
{
  grn_encoding encoding = GRN_ENC_NONE;
  const mecab_dictionary_info_t *dictionary_info;
  dictionary_info = mecab_dictionary_info(mecab);
  if (dictionary_info) {
    const char *charset = dictionary_info->charset;
    encoding = translate_mecab_charset_to_grn_encoding(charset);
  }
  return encoding;
}

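/*
 * Reports whether the given character is a good chunk boundary:
 * ASCII ",.!?" plus their common Japanese counterparts, the ideographic
 * comma/full stop and the fullwidth "!"/"?", matched as UTF-8 byte
 * sequences.
 */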
static inline grn_bool
is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
{
  switch (character_bytes) {
  case 1 :
    switch (character[0]) {
    case ',' :
    case '.' :
    case '!' :
    case '?' :
      return GRN_TRUE;
    default :
      return GRN_FALSE;
    }
  case 3 :
    switch ((unsigned char)(character[0])) {
    case 0xE3 :
      switch ((unsigned char)(character[1])) {
      case 0x80 :
        switch ((unsigned char)(character[2])) {
        case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */
        case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */
          return GRN_TRUE;
        default :
          return GRN_FALSE;
        }
      default :
        return GRN_FALSE;
      }
      return GRN_FALSE;
    case 0xEF :
      switch ((unsigned char)(character[1])) {
      case 0xBC :
        switch ((unsigned char)(character[2])) {
        case 0x81 :
          /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */
        case 0x9F :
          /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */
          return GRN_TRUE;
        default :
          return GRN_FALSE;
        }
      default :
        return GRN_FALSE;
      }
      return GRN_FALSE;
    default :
      return GRN_FALSE;
    }
  default :
    return GRN_FALSE;
  }
}

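/*
 * Runs MeCab on one chunk and appends the wakati output to
 * tokenizer->buf. Chunks are joined with a single space; one trailing
 * whitespace byte from MeCab's output is dropped so that the separator
 * is not doubled.
 */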
static grn_bool
chunked_tokenize_utf8_chunk(grn_ctx *ctx,
                            grn_mecab_tokenizer *tokenizer,
                            const char *chunk,
                            unsigned int chunk_bytes)
{
  const char *tokenized_chunk;
  size_t tokenized_chunk_length;

  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
  if (!tokenized_chunk) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab][chunk] "
                     "mecab_sparse_tostr2() failed len=%d err=%s",
                     chunk_bytes,
                     mecab_strerror(tokenizer->mecab));
    return GRN_FALSE;
  }

  if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
    GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
  }

  tokenized_chunk_length = strlen(tokenized_chunk);
  if (tokenized_chunk_length >= 1 &&
      isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) {
    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
                 tokenized_chunk, tokenized_chunk_length - 1);
  } else {
    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
                 tokenized_chunk, tokenized_chunk_length);
  }

  return GRN_TRUE;
}

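/*
 * Splits a long UTF-8 input so that each MeCab call stays near
 * grn_mecab_chunk_size_threshold bytes. Whitespace always ends a chunk;
 * once a chunk reaches the threshold, it is cut at the last delimiter
 * seen (see is_delimiter_character()), or at the current position if no
 * delimiter has been seen.
 */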
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;

    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    current_character = current;
    current += character_bytes;
    if (is_delimiter_character(ctx, current_character, character_bytes)) {
      last_delimiter = current;
    }

    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}

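/*
 * Creates a mecab_t in wakati (space-separated segmentation) mode.
 * With a bundled MeCab, an explicit --rcfile is passed; on Windows its
 * path separators are rewritten to backslashes.
 */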
static mecab_t *
mecab_create(grn_ctx *ctx)
{
  mecab_t *mecab;
  int argc = 0;
  const char *argv[4];

  argv[argc++] = "Groonga";
  argv[argc++] = "-Owakati";
#ifdef GRN_WITH_BUNDLED_MECAB
  argv[argc++] = "--rcfile";
# ifdef WIN32
  {
    static char windows_mecab_rc_file[PATH_MAX];

    grn_strcpy(windows_mecab_rc_file,
               PATH_MAX,
               grn_plugin_windows_base_dir());
    grn_strcat(windows_mecab_rc_file,
               PATH_MAX,
               "/");
    grn_strcat(windows_mecab_rc_file,
               PATH_MAX,
               GRN_BUNDLED_MECAB_RELATIVE_RC_PATH);
    {
      char *c;
      for (c = windows_mecab_rc_file; *c != '\0'; c++) {
        if (*c == '/') {
          *c = '\\';
        }
      }
    }
    argv[argc++] = windows_mecab_rc_file;
  }
# else /* WIN32 */
  argv[argc++] = GRN_BUNDLED_MECAB_RC_PATH;
# endif /* WIN32 */
#endif /* GRN_WITH_BUNDLED_MECAB */
  mecab = mecab_new(argc, (char **)argv);

  if (!mecab) {
#ifdef GRN_WITH_BUNDLED_MECAB
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] failed to create mecab_t: %s: "
                     "mecab_new(\"%s\", \"%s\", \"%s\", \"%s\")",
                     mecab_global_error_message(),
                     argv[0], argv[1], argv[2], argv[3]);
#else /* GRN_WITH_BUNDLED_MECAB */
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] failed to create mecab_t: %s: "
                     "mecab_new(\"%s\", \"%s\")",
                     mecab_global_error_message(),
                     argv[0], argv[1]);
#endif /* GRN_WITH_BUNDLED_MECAB */
  }

  return mecab;
}

/*
  This function is called both for a full text search query and for a
  document to be indexed, so it receives both short and long strings.
  The return value of this function is ignored. When an error occurs in
  this function, `ctx->rc' is overwritten with an error code (not
  GRN_SUCCESS).
 */
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer;
  unsigned int normalizer_flags = 0;
  grn_tokenizer_query *query;
  grn_obj *normalized_query;
  const char *normalized_string;
  unsigned int normalized_string_length;

  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
  if (!query) {
    return NULL;
  }
  if (!sole_mecab) {
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (!sole_mecab) {
      sole_mecab = mecab_create(ctx);
      if (sole_mecab) {
        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
  }
  if (!sole_mecab) {
    grn_tokenizer_query_close(ctx, query);
    return NULL;
  }

  if (query->encoding != sole_mecab_encoding) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab dictionary charset (%s) does not match "
                     "the table encoding: <%s>",
                     grn_encoding_to_string(sole_mecab_encoding),
                     grn_encoding_to_string(query->encoding));
    return NULL;
  }

  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
    grn_tokenizer_query_close(ctx, query);
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] "
                     "memory allocation to grn_mecab_tokenizer failed");
    return NULL;
  }
  tokenizer->mecab = sole_mecab;
  tokenizer->query = query;

  normalized_query = query->normalized_query;
  grn_string_get_normalized(ctx,
                            normalized_query,
                            &normalized_string,
                            &normalized_string_length,
                            NULL);
  GRN_TEXT_INIT(&(tokenizer->buf), 0);
  if (query->have_tokenized_delimiter) {
    tokenizer->next = normalized_string;
    tokenizer->end = tokenizer->next + normalized_string_length;
  } else if (normalized_string_length == 0) {
    tokenizer->next = "";
    tokenizer->end = tokenizer->next;
  } else {
    grn_bool succeeded;
    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
    if (grn_mecab_chunked_tokenize_enabled &&
        ctx->encoding == GRN_ENC_UTF8) {
      succeeded = chunked_tokenize_utf8(ctx,
                                        tokenizer,
                                        normalized_string,
                                        normalized_string_length);
    } else {
      const char *s;
      s = mecab_sparse_tostr2(tokenizer->mecab,
                              normalized_string,
                              normalized_string_length);
      if (!s) {
        succeeded = GRN_FALSE;
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer][mecab] "
                         "mecab_sparse_tostr2() failed len=%d err=%s",
                         normalized_string_length,
                         mecab_strerror(tokenizer->mecab));
      } else {
        succeeded = GRN_TRUE;
        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
      }
    }
    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
    if (!succeeded) {
      grn_tokenizer_query_close(ctx, tokenizer->query);
      GRN_PLUGIN_FREE(ctx, tokenizer);
      return NULL;
    }
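    /*
     * Trim trailing whitespace in place. The scan starts at the
     * second-to-last byte: the last byte is assumed to be the trailing
     * newline of the wakati output and is always excluded by
     * `tokenizer->end = p + 1`.
     */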
    {
      char *buf, *p;
      unsigned int bufsize;

      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
      bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
      /* A certain version of mecab returns trailing lf or spaces. */
      for (p = buf + bufsize - 2;
           buf <= p && isspace(*(unsigned char *)p);
           p--) { *p = '\0'; }
      tokenizer->next = buf;
      tokenizer->end = p + 1;
    }
  }
  user_data->ptr = tokenizer;

  grn_tokenizer_token_init(ctx, &(tokenizer->token));

  return NULL;
}

/*
  This function returns tokens one by one.
 */
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;

  if (tokenizer->query->have_tokenized_delimiter) {
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
  } else {
    size_t cl;
    const char *p = tokenizer->next, *r;
    const char *e = tokenizer->end;
    grn_tokenizer_status status;

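    /*
     * Scan the wakati output: leading spaces are skipped (advancing p),
     * then r advances to the next whitespace run, which ends the token;
     * tokenizer->next is moved past that whole run.
     */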
    for (r = p; r < e; r += cl) {
      int space_len;

      space_len = grn_isspace(r, encoding);
      if (space_len > 0 && r == p) {
        cl = space_len;
        p = r + cl;
        continue;
      }

      if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
        tokenizer->next = e;
        break;
      }

      if (space_len > 0) {
        const char *q = r + space_len;
        while (q < e && (space_len = grn_isspace(q, encoding))) {
          q += space_len;
        }
        tokenizer->next = q;
        break;
      }
    }

    if (r == e || tokenizer->next == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }

  return NULL;
}

/*
  This function finalizes a tokenization.
 */
static grn_obj *
mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  if (!tokenizer) {
    return NULL;
  }
  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
  grn_tokenizer_query_close(ctx, tokenizer->query);
  grn_obj_unlink(ctx, &(tokenizer->buf));
  GRN_PLUGIN_FREE(ctx, tokenizer);
  return NULL;
}

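/*
 * Creates a throwaway mecab_t just to check that the dictionary charset
 * matches the context encoding, reporting an error through ctx->rc if
 * it does not. A no-op unless HAVE_MECAB_DICTIONARY_INFO_T is defined.
 */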
static void
check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
  mecab_t *mecab;
  grn_encoding encoding;
  grn_bool have_same_encoding_dictionary;

  mecab = mecab_create(ctx);
  if (!mecab) {
    return;
  }

  encoding = GRN_CTX_GET_ENCODING(ctx);
  have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
  mecab_destroy(mecab);

  if (!have_same_encoding_dictionary) {
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] "
                     "MeCab has no dictionary that uses the context encoding"
                     ": <%s>",
                     grn_encoding_to_string(encoding));
  }
#endif
}

/*
  This function initializes the plugin. It fails if there is no
  dictionary that uses the context encoding of Groonga.
 */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  ++sole_mecab_init_counter;
  if (sole_mecab_init_counter > 1) {
    return GRN_SUCCESS;
  }
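
  /*
   * Tunables, read once at first initialization:
   * GRN_MECAB_CHUNKED_TOKENIZE_ENABLED=yes enables chunked tokenization
   * for UTF-8 input, and GRN_MECAB_CHUNK_SIZE_THRESHOLD overrides the
   * default chunk size of 8192 bytes.
   */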
  {
    char env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED",
               env,
               GRN_ENV_BUFFER_SIZE);
    grn_mecab_chunked_tokenize_enabled = (env[0] && strcmp(env, "yes") == 0);
  }

  {
    char env[GRN_ENV_BUFFER_SIZE];

    grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD",
               env,
               GRN_ENV_BUFFER_SIZE);
    if (env[0]) {
      int threshold = -1;
      const char *end;
      const char *rest;

      end = env + strlen(env);
      threshold = grn_atoi(env, end, &rest);
      if (end > env && end == rest) {
        grn_mecab_chunk_size_threshold = threshold;
      }
    }
  }

  sole_mecab = NULL;
  sole_mecab_mutex = grn_plugin_mutex_open(ctx);
  if (!sole_mecab_mutex) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[tokenizer][mecab] grn_plugin_mutex_open() failed");
    return ctx->rc;
  }

  check_mecab_dictionary_encoding(ctx);
  if (ctx->rc != GRN_SUCCESS) {
    grn_plugin_mutex_close(ctx, sole_mecab_mutex);
    sole_mecab_mutex = NULL;
  }

  return ctx->rc;
}

/*
  This function registers a plugin to a database.
 */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  grn_rc rc;

  rc = grn_tokenizer_register(ctx, "TokenMecab", 10,
                              mecab_init, mecab_next, mecab_fin);
  if (rc == GRN_SUCCESS) {
    grn_obj *token_mecab;
    token_mecab = grn_ctx_get(ctx, "TokenMecab", 10);
    /* Just for backward compatibility: TokenMecab used to be built in,
       not a plugin. */
    if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) {
      rc = GRN_FILE_CORRUPT;
    }
  }

  return rc;
}

/*
  This function finalizes a plugin.
 */
grn_rc
GRN_PLUGIN_FIN(grn_ctx *ctx)
{
  --sole_mecab_init_counter;
  if (sole_mecab_init_counter > 0) {
    return GRN_SUCCESS;
  }
  if (sole_mecab) {
    mecab_destroy(sole_mecab);
    sole_mecab = NULL;
  }
  if (sole_mecab_mutex) {
    grn_plugin_mutex_close(ctx, sole_mecab_mutex);
    sole_mecab_mutex = NULL;
  }

  return GRN_SUCCESS;
}