1 /* -*- c-basic-offset: 2 -*- */
2 /*
3 Copyright(C) 2009-2016 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 */
18
19 #ifdef GRN_EMBEDDED
20 # define GRN_PLUGIN_FUNCTION_TAG tokenizers_mecab
21 #endif
22
23 #include <grn_str.h>
24
25 #include <groonga.h>
26 #include <groonga/tokenizer.h>
27
28 #include <mecab.h>
29
30 #include <stdlib.h>
31 #include <string.h>
32 #include <ctype.h>
33
/* Process-wide shared MeCab state.  A single mecab_t instance is shared by
   all tokenizer contexts; creation and parsing are serialized with
   sole_mecab_mutex because mecab_t is used from multiple threads. */
static unsigned int sole_mecab_init_counter = 0;  /* plugin init/fin nesting depth */
static mecab_t *sole_mecab = NULL;
static grn_plugin_mutex *sole_mecab_mutex = NULL;
static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;  /* charset of the loaded dictionary */

/* Tunables, read from environment variables in GRN_PLUGIN_INIT(). */
static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
static int grn_mecab_chunk_size_threshold = 8192;  /* bytes per chunk when chunking */

/* Per-query tokenizer state created by mecab_init() and released by
   mecab_fin(). */
typedef struct {
  mecab_t *mecab;               /* borrowed pointer to sole_mecab (not owned) */
  grn_obj buf;                  /* MeCab wakati output: space-separated tokens */
  const char *next;             /* current read position for mecab_next() */
  const char *end;              /* end of tokenized text */
  grn_tokenizer_query *query;   /* owned; closed in mecab_fin() */
  grn_tokenizer_token token;    /* current token being emitted */
} grn_mecab_tokenizer;
50
/*
  Returns a human readable message for a failure that happened before a
  mecab_t instance exists (e.g. mecab_new() failure).
*/
static const char *
mecab_global_error_message(void)
{
  /* MeCab <= 0.993 doesn't support mecab_strerror(NULL). */
  const double mecab_numeric_version = atof(mecab_version());
  if (mecab_numeric_version <= 0.993) {
    return "Unknown";
  }

  return mecab_strerror(NULL);
}
64
65
66 static grn_encoding
translate_mecab_charset_to_grn_encoding(const char * charset)67 translate_mecab_charset_to_grn_encoding(const char *charset)
68 {
69 if (grn_strcasecmp(charset, "euc-jp") == 0) {
70 return GRN_ENC_EUC_JP;
71 } else if (grn_strcasecmp(charset, "utf-8") == 0 ||
72 grn_strcasecmp(charset, "utf8") == 0) {
73 return GRN_ENC_UTF8;
74 } else if (grn_strcasecmp(charset, "shift_jis") == 0 ||
75 grn_strcasecmp(charset, "shift-jis") == 0 ||
76 grn_strcasecmp(charset, "sjis") == 0) {
77 return GRN_ENC_SJIS;
78 }
79 return GRN_ENC_NONE;
80 }
81
82 static grn_encoding
get_mecab_encoding(mecab_t * mecab)83 get_mecab_encoding(mecab_t *mecab)
84 {
85 grn_encoding encoding = GRN_ENC_NONE;
86 const mecab_dictionary_info_t *dictionary_info;
87 dictionary_info = mecab_dictionary_info(mecab);
88 if (dictionary_info) {
89 const char *charset = dictionary_info->charset;
90 encoding = translate_mecab_charset_to_grn_encoding(charset);
91 }
92 return encoding;
93 }
94
95 static inline grn_bool
is_delimiter_character(grn_ctx * ctx,const char * character,int character_bytes)96 is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
97 {
98 switch (character_bytes) {
99 case 1 :
100 switch (character[0]) {
101 case ',' :
102 case '.' :
103 case '!' :
104 case '?' :
105 return GRN_TRUE;
106 default :
107 return GRN_FALSE;
108 }
109 case 3 :
110 switch ((unsigned char)(character[0])) {
111 case 0xE3 :
112 switch ((unsigned char)(character[1])) {
113 case 0x80 :
114 switch ((unsigned char)(character[2])) {
115 case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */
116 case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */
117 return GRN_TRUE;
118 default :
119 return GRN_FALSE;
120 }
121 default :
122 return GRN_FALSE;
123 }
124 return GRN_FALSE;
125 case 0xEF :
126 switch ((unsigned char)(character[1])) {
127 case 0xBC :
128 switch ((unsigned char)(character[2])) {
129 case 0x81 :
130 /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */
131 case 0x9F :
132 /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */
133 return GRN_TRUE;
134 default :
135 return GRN_FALSE;
136 }
137 default :
138 return GRN_FALSE;
139 }
140 return GRN_FALSE;
141 default :
142 return GRN_FALSE;
143 }
144 default :
145 return GRN_FALSE;
146 }
147 }
148
149 static grn_bool
chunked_tokenize_utf8_chunk(grn_ctx * ctx,grn_mecab_tokenizer * tokenizer,const char * chunk,unsigned int chunk_bytes)150 chunked_tokenize_utf8_chunk(grn_ctx *ctx,
151 grn_mecab_tokenizer *tokenizer,
152 const char *chunk,
153 unsigned int chunk_bytes)
154 {
155 const char *tokenized_chunk;
156 size_t tokenized_chunk_length;
157
158 tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
159 if (!tokenized_chunk) {
160 GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
161 "[tokenizer][mecab][chunk] "
162 "mecab_sparse_tostr2() failed len=%d err=%s",
163 chunk_bytes,
164 mecab_strerror(tokenizer->mecab));
165 return GRN_FALSE;
166 }
167
168 if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
169 GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
170 }
171
172 tokenized_chunk_length = strlen(tokenized_chunk);
173 if (tokenized_chunk_length >= 1 &&
174 isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) {
175 GRN_TEXT_PUT(ctx, &(tokenizer->buf),
176 tokenized_chunk, tokenized_chunk_length - 1);
177 } else {
178 GRN_TEXT_PUT(ctx, &(tokenizer->buf),
179 tokenized_chunk, tokenized_chunk_length);
180 }
181
182 return GRN_TRUE;
183 }
184
/*
  Tokenizes a long UTF-8 string by feeding it to MeCab in chunks of at
  most grn_mecab_chunk_size_threshold bytes, preferring to split at
  whitespace or sentence delimiters so MeCab sees natural boundaries.
  Returns GRN_FALSE (with ctx->rc set) when a chunk fails or the input
  contains an invalid byte sequence.
*/
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;   /* position just after the last delimiter seen */
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  /* Short input: tokenize as a single chunk.
     NOTE(review): unsigned string_bytes vs signed threshold is a mixed
     comparison; it is harmless only while the threshold is positive. */
  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;

    /* Whitespace always ends the current chunk; the space itself is not
       passed to MeCab. */
    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    /* Remember the most recent sentence delimiter so an oversized chunk
       can be cut there instead of mid-word. */
    current_character = current;
    current += character_bytes;
    if (is_delimiter_character(ctx, current_character, character_bytes)) {
      last_delimiter = current;
    }

    /* Chunk grew past the threshold: flush it, cutting at the last
       delimiter when one was seen, otherwise at the current position. */
    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  /* Flush the final partial chunk, if any. */
  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}
275
/*
  Creates a new mecab_t in wakati (space-separated) output mode.
  With a bundled MeCab, an explicit --rcfile is passed so the bundled
  dictionary is found; on Windows the rc path is additionally rebuilt
  relative to the plugin's base directory with backslash separators.
  Returns NULL (with ctx->rc set) on failure.
*/
static mecab_t *
mecab_create(grn_ctx *ctx)
{
  mecab_t *mecab;
  int argc = 0;
  const char *argv[4];  /* max: "Groonga", "-Owakati", "--rcfile", path */

  argv[argc++] = "Groonga";
  argv[argc++] = "-Owakati";
#ifdef GRN_WITH_BUNDLED_MECAB
  argv[argc++] = "--rcfile";
# ifdef WIN32
  {
    /* static: mecab_new() may keep referring to argv contents. */
    static char windows_mecab_rc_file[PATH_MAX];

    grn_strcpy(windows_mecab_rc_file,
               PATH_MAX,
               grn_plugin_windows_base_dir());
    grn_strcat(windows_mecab_rc_file,
               PATH_MAX,
               "/");
    grn_strcat(windows_mecab_rc_file,
               PATH_MAX,
               GRN_BUNDLED_MECAB_RELATIVE_RC_PATH);
    {
      /* Convert to Windows path separators. */
      char *c;
      for (c = windows_mecab_rc_file; *c != '\0'; c++) {
        if (*c == '/') {
          *c = '\\';
        }
      }
    }
    argv[argc++] = windows_mecab_rc_file;
  }
# else /* WIN32 */
  argv[argc++] = GRN_BUNDLED_MECAB_RC_PATH;
# endif /* WIN32 */
#endif /* GRN_WITH_BUNDLED_MECAB */
  mecab = mecab_new(argc, (char **)argv);

  if (!mecab) {
#ifdef GRN_WITH_BUNDLED_MECAB
    /* Bundled build always has 4 argv entries; report them all. */
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] failed to create mecab_t: %s: "
                     "mecab_new(\"%s\", \"%s\", \"%s\", \"%s\")",
                     mecab_global_error_message(),
                     argv[0], argv[1], argv[2], argv[3]);
#else /* GRN_WITH_BUNDLED_MECAB */
    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                     "[tokenizer][mecab] failed to create mecab_t: %s: "
                     "mecab_new(\"%s\", \"%s\")",
                     mecab_global_error_message(),
                     argv[0], argv[1]);
#endif /* GRN_WITH_BUNDLED_MECAB */
  }

  return mecab;
}
334
335 /*
336 This function is called for a full text search query or a document to be
337 indexed. This means that both short/long strings are given.
338 The return value of this function is ignored. When an error occurs in this
339 function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
340 */
341 static grn_obj *
mecab_init(grn_ctx * ctx,int nargs,grn_obj ** args,grn_user_data * user_data)342 mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
343 {
344 grn_mecab_tokenizer *tokenizer;
345 unsigned int normalizer_flags = 0;
346 grn_tokenizer_query *query;
347 grn_obj *normalized_query;
348 const char *normalized_string;
349 unsigned int normalized_string_length;
350
351 query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
352 if (!query) {
353 return NULL;
354 }
355 if (!sole_mecab) {
356 grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
357 if (!sole_mecab) {
358 sole_mecab = mecab_create(ctx);
359 if (sole_mecab) {
360 sole_mecab_encoding = get_mecab_encoding(sole_mecab);
361 }
362 }
363 grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
364 }
365 if (!sole_mecab) {
366 grn_tokenizer_query_close(ctx, query);
367 return NULL;
368 }
369
370 if (query->encoding != sole_mecab_encoding) {
371 grn_tokenizer_query_close(ctx, query);
372 GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
373 "[tokenizer][mecab] "
374 "MeCab dictionary charset (%s) does not match "
375 "the table encoding: <%s>",
376 grn_encoding_to_string(sole_mecab_encoding),
377 grn_encoding_to_string(query->encoding));
378 return NULL;
379 }
380
381 if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
382 grn_tokenizer_query_close(ctx, query);
383 GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
384 "[tokenizer][mecab] "
385 "memory allocation to grn_mecab_tokenizer failed");
386 return NULL;
387 }
388 tokenizer->mecab = sole_mecab;
389 tokenizer->query = query;
390
391 normalized_query = query->normalized_query;
392 grn_string_get_normalized(ctx,
393 normalized_query,
394 &normalized_string,
395 &normalized_string_length,
396 NULL);
397 GRN_TEXT_INIT(&(tokenizer->buf), 0);
398 if (query->have_tokenized_delimiter) {
399 tokenizer->next = normalized_string;
400 tokenizer->end = tokenizer->next + normalized_string_length;
401 } else if (normalized_string_length == 0) {
402 tokenizer->next = "";
403 tokenizer->end = tokenizer->next;
404 } else {
405 grn_bool succeeded;
406 grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
407 if (grn_mecab_chunked_tokenize_enabled &&
408 ctx->encoding == GRN_ENC_UTF8) {
409 succeeded = chunked_tokenize_utf8(ctx,
410 tokenizer,
411 normalized_string,
412 normalized_string_length);
413 } else {
414 const char *s;
415 s = mecab_sparse_tostr2(tokenizer->mecab,
416 normalized_string,
417 normalized_string_length);
418 if (!s) {
419 succeeded = GRN_FALSE;
420 GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
421 "[tokenizer][mecab] "
422 "mecab_sparse_tostr() failed len=%d err=%s",
423 normalized_string_length,
424 mecab_strerror(tokenizer->mecab));
425 } else {
426 succeeded = GRN_TRUE;
427 GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
428 }
429 }
430 grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
431 if (!succeeded) {
432 grn_tokenizer_query_close(ctx, tokenizer->query);
433 GRN_PLUGIN_FREE(ctx, tokenizer);
434 return NULL;
435 }
436 {
437 char *buf, *p;
438 unsigned int bufsize;
439
440 buf = GRN_TEXT_VALUE(&(tokenizer->buf));
441 bufsize = GRN_TEXT_LEN(&(tokenizer->buf));
442 /* A certain version of mecab returns trailing lf or spaces. */
443 for (p = buf + bufsize - 2;
444 buf <= p && isspace(*(unsigned char *)p);
445 p--) { *p = '\0'; }
446 tokenizer->next = buf;
447 tokenizer->end = p + 1;
448 }
449 }
450 user_data->ptr = tokenizer;
451
452 grn_tokenizer_token_init(ctx, &(tokenizer->token));
453
454 return NULL;
455 }
456
/*
  This function returns tokens one by one.
  Tokens are whitespace-separated runs in the wakati output stored in
  tokenizer->buf (or, with a tokenized delimiter, in the source string).
*/
static grn_obj *
mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
  /* grn_obj *table = args[0]; */
  grn_mecab_tokenizer *tokenizer = user_data->ptr;
  grn_encoding encoding = tokenizer->query->encoding;

  if (tokenizer->query->have_tokenized_delimiter) {
    /* Input already carries explicit token delimiters; let the helper
       advance and push the next token. */
    tokenizer->next =
      grn_tokenizer_tokenized_delimiter_next(ctx,
                                             &(tokenizer->token),
                                             tokenizer->next,
                                             tokenizer->end - tokenizer->next,
                                             encoding);
  } else {
    size_t cl;  /* byte length of the character at `r' */
    const char *p = tokenizer->next, *r;
    const char *e = tokenizer->end;
    grn_tokenizer_status status;

    /* Scan forward: skip leading whitespace, then advance `r' to the end
       of the token (the next whitespace or end of buffer). */
    for (r = p; r < e; r += cl) {
      int space_len;

      space_len = grn_isspace(r, encoding);
      if (space_len > 0 && r == p) {
        /* Still in leading whitespace: move the token start forward. */
        cl = space_len;
        p = r + cl;
        continue;
      }

      if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
        /* Invalid byte sequence: stop tokenizing this buffer. */
        tokenizer->next = e;
        break;
      }

      if (space_len > 0) {
        /* Token ended; skip the whole whitespace run so the next call
           starts at the following token. */
        const char *q = r + space_len;
        while (q < e && (space_len = grn_isspace(q, encoding))) {
          q += space_len;
        }
        tokenizer->next = q;
        break;
      }
    }

    if (r == e || tokenizer->next == e) {
      status = GRN_TOKENIZER_LAST;
    } else {
      status = GRN_TOKENIZER_CONTINUE;
    }
    grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
  }

  return NULL;
}
515
516 /*
517 This function finalizes a tokenization.
518 */
519 static grn_obj *
mecab_fin(grn_ctx * ctx,int nargs,grn_obj ** args,grn_user_data * user_data)520 mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
521 {
522 grn_mecab_tokenizer *tokenizer = user_data->ptr;
523 if (!tokenizer) {
524 return NULL;
525 }
526 grn_tokenizer_token_fin(ctx, &(tokenizer->token));
527 grn_tokenizer_query_close(ctx, tokenizer->query);
528 grn_obj_unlink(ctx, &(tokenizer->buf));
529 GRN_PLUGIN_FREE(ctx, tokenizer);
530 return NULL;
531 }
532
533 static void
check_mecab_dictionary_encoding(grn_ctx * ctx)534 check_mecab_dictionary_encoding(grn_ctx *ctx)
535 {
536 #ifdef HAVE_MECAB_DICTIONARY_INFO_T
537 mecab_t *mecab;
538 grn_encoding encoding;
539 grn_bool have_same_encoding_dictionary;
540
541 mecab = mecab_create(ctx);
542 if (!mecab) {
543 return;
544 }
545
546 encoding = GRN_CTX_GET_ENCODING(ctx);
547 have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
548 mecab_destroy(mecab);
549
550 if (!have_same_encoding_dictionary) {
551 GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
552 "[tokenizer][mecab] "
553 "MeCab has no dictionary that uses the context encoding"
554 ": <%s>",
555 grn_encoding_to_string(encoding));
556 }
557 #endif
558 }
559
560 /*
561 This function initializes a plugin. This function fails if there is no
562 dictionary that uses the context encoding of groonga.
563 */
564 grn_rc
GRN_PLUGIN_INIT(grn_ctx * ctx)565 GRN_PLUGIN_INIT(grn_ctx *ctx)
566 {
567 ++sole_mecab_init_counter;
568 if (sole_mecab_init_counter > 1)
569 {
570 return GRN_SUCCESS;
571 }
572 {
573 char env[GRN_ENV_BUFFER_SIZE];
574
575 grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED",
576 env,
577 GRN_ENV_BUFFER_SIZE);
578 grn_mecab_chunked_tokenize_enabled = (env[0] && strcmp(env, "yes") == 0);
579 }
580
581 {
582 char env[GRN_ENV_BUFFER_SIZE];
583
584 grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD",
585 env,
586 GRN_ENV_BUFFER_SIZE);
587 if (env[0]) {
588 int threshold = -1;
589 const char *end;
590 const char *rest;
591
592 end = env + strlen(env);
593 threshold = grn_atoi(env, end, &rest);
594 if (end > env && end == rest) {
595 grn_mecab_chunk_size_threshold = threshold;
596 }
597 }
598 }
599
600 sole_mecab = NULL;
601 sole_mecab_mutex = grn_plugin_mutex_open(ctx);
602 if (!sole_mecab_mutex) {
603 GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
604 "[tokenizer][mecab] grn_plugin_mutex_open() failed");
605 return ctx->rc;
606 }
607
608 check_mecab_dictionary_encoding(ctx);
609 if (ctx->rc != GRN_SUCCESS) {
610 grn_plugin_mutex_close(ctx, sole_mecab_mutex);
611 sole_mecab_mutex = NULL;
612 }
613
614 return ctx->rc;
615 }
616
617 /*
618 This function registers a plugin to a database.
619 */
620 grn_rc
GRN_PLUGIN_REGISTER(grn_ctx * ctx)621 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
622 {
623 grn_rc rc;
624
625 rc = grn_tokenizer_register(ctx, "TokenMecab", 10,
626 mecab_init, mecab_next, mecab_fin);
627 if (rc == GRN_SUCCESS) {
628 grn_obj *token_mecab;
629 token_mecab = grn_ctx_get(ctx, "TokenMecab", 10);
630 /* Just for backward compatibility. TokenMecab was built-in not plugin. */
631 if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) {
632 rc = GRN_FILE_CORRUPT;
633 }
634 }
635
636 return rc;
637 }
638
639 /*
640 This function finalizes a plugin.
641 */
642 grn_rc
GRN_PLUGIN_FIN(grn_ctx * ctx)643 GRN_PLUGIN_FIN(grn_ctx *ctx)
644 {
645 --sole_mecab_init_counter;
646 if (sole_mecab_init_counter > 0)
647 {
648 return GRN_SUCCESS;
649 }
650 if (sole_mecab) {
651 mecab_destroy(sole_mecab);
652 sole_mecab = NULL;
653 }
654 if (sole_mecab_mutex) {
655 grn_plugin_mutex_close(ctx, sole_mecab_mutex);
656 sole_mecab_mutex = NULL;
657 }
658
659 return GRN_SUCCESS;
660 }
661