1 /* Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 #include "my_config.h"
24 #include "mysqld_error.h"
25 #include <string>
26 #include <log.h>
27 #include <mecab.h>
28 #include <fts0tokenize.h>
29
30 /* We are following InnoDB coding guidelines. */
31
32 /** Global mecab objects shared by all threads. */
33 static MeCab::Model* mecab_model = NULL;
34 static MeCab::Tagger* mecab_tagger = NULL;
35
36 /** Mecab charset. */
37 static char mecab_charset[64];
38
39 /** Mecab rc file path. */
40 static char* mecab_rc_file;
41
42 static const char* mecab_min_supported_version = "0.993";
43 static const char* mecab_max_supported_version = "0.996";
44
45 #if defined(BUNDLE_MECAB)
46 static const bool bundle_mecab= true;
47 #else
48 static const bool bundle_mecab= false;
49 #endif
50
51 /** Set MeCab parser charset.
52 @param[in] charset charset string
53 @retval true on success
54 @retval false on failure */
55 static
56 bool
mecab_parser_check_and_set_charset(const char * charset)57 mecab_parser_check_and_set_charset(
58 const char* charset)
59 {
60 /* Array used to map mecab charset to mysql charset. */
61 static const int mecab_charset_count = 4;
62 static const char* mecab_charset_values[mecab_charset_count][2] = {
63 {"euc-jp", "ujis"},
64 {"sjis", "sjis"},
65 {"utf-8", "utf8"},
66 {"utf8", "utf8"}
67 };
68
69 for (int i = 0; i < mecab_charset_count; i++) {
70 if (native_strcasecmp(charset, mecab_charset_values[i][0])
71 == 0 ) {
72 strcpy(mecab_charset, mecab_charset_values[i][1]);
73 return(true);
74 }
75 }
76
77 return(false);
78 }
79
80 /** MeCab parser plugin initialization.
81 @retval 0 on success
82 @retval 1 on failure. */
83 static
84 int
mecab_parser_plugin_init(void *)85 mecab_parser_plugin_init(void*)
86 {
87 const MeCab::DictionaryInfo* mecab_dict;
88
89 /* Check mecab version. */
90 if (strcmp(MeCab::Model::version(), mecab_min_supported_version) < 0) {
91 sql_print_error("Mecab v%s is not supported,"
92 " the lowest version supported is v%s.",
93 MeCab::Model::version(),
94 mecab_min_supported_version);
95 return(1);
96 }
97
98 if (strcmp(MeCab::Model::version(), mecab_max_supported_version) > 0) {
99 sql_print_warning("Mecab v%s is not verified,"
100 " the highest version supported is v%s.",
101 MeCab::Model::version(),
102 mecab_max_supported_version);
103 }
104
105 if (mecab_rc_file != NULL) {
106 std::string rcfile_arg;
107
108 /* See src/tagger.cpp for available options.
109 --rcfile=<mecabrc file> "use FILE as resource file" */
110 rcfile_arg += "--rcfile=";
111 rcfile_arg += mecab_rc_file;
112
113 /* It seems we *must* have some kind of mecabrc
114 file available before calling createModel, see
115 load_dictionary_resource() in src/utils.cpp */
116 sql_print_information("Mecab: Trying createModel(%s)",
117 rcfile_arg.c_str());
118
119 mecab_model = MeCab::createModel(rcfile_arg.c_str());
120 } else {
121 sql_print_information("Mecab: Trying createModel()");
122 mecab_model = MeCab::createModel("");
123 }
124
125 if (mecab_model == NULL) {
126 sql_print_error("Mecab: createModel() failed: %s",
127 MeCab::getLastError());
128 return(1);
129 }
130
131 mecab_tagger = mecab_model->createTagger();
132 if (mecab_tagger == NULL) {
133 sql_print_error("Mecab: createTagger() failed: %s",
134 MeCab::getLastError());
135 delete mecab_model;
136 mecab_model= NULL;
137 return(1);
138 }
139
140 mecab_dict = mecab_model->dictionary_info();
141 mecab_charset[0] = '\0';
142 if (!mecab_parser_check_and_set_charset(mecab_dict->charset)) {
143 delete mecab_tagger;
144 mecab_tagger = NULL;
145
146 sql_print_error("Mecab: Unsupported dictionary charset %s",
147 mecab_dict->charset);
148
149 delete mecab_model;
150 mecab_model = NULL;
151
152 return(1);
153 } else {
154 sql_print_information("Mecab: Loaded dictionary charset is %s",
155 mecab_dict->charset);
156 return(0);
157 }
158 }
159
160 /** MeCab parser plugin deinit
161 @retval 0 */
162 static
163 int
mecab_parser_plugin_deinit(void *)164 mecab_parser_plugin_deinit(void*)
165 {
166 delete mecab_tagger;
167 mecab_tagger = NULL;
168
169 delete mecab_model;
170 mecab_model = NULL;
171
172 return(0);
173 }
174
175 /** Parse a document by MeCab.
176 @param[in] mecab_lattice mecab lattice
177 @param[in] param plugin parser param
178 @param[in] doc document to parse
179 @param[in] len document length
180 @param[in,out] bool_info boolean info
181 @retvat 0 on success
182 @retval 1 on failure. */
183 static
184 int
mecab_parse(MeCab::Lattice * mecab_lattice,MYSQL_FTPARSER_PARAM * param,char * doc,int len,MYSQL_FTPARSER_BOOLEAN_INFO * bool_info)185 mecab_parse(
186 MeCab::Lattice* mecab_lattice,
187 MYSQL_FTPARSER_PARAM* param,
188 char* doc,
189 int len,
190 MYSQL_FTPARSER_BOOLEAN_INFO*
191 bool_info)
192 {
193 static MYSQL_FTPARSER_BOOLEAN_INFO token_info =
194 { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
195 int position = 0;
196 int token_num = 0;
197 int ret = 0;
198 bool term_converted = false;
199
200 try {
201 mecab_lattice->set_sentence(doc, len);
202
203 if(!mecab_tagger->parse(mecab_lattice)) {
204 sql_print_error("Mecab: parse() failed: %s",
205 mecab_lattice->what());
206 return(1);
207 }
208 } catch (std::bad_alloc const &) {
209 sql_print_error("Mecab: parse() failed: out of memory.");
210
211 return(1);
212 }
213
214 if (param->mode == MYSQL_FTPARSER_FULL_BOOLEAN_INFO) {
215 for (const MeCab::Node* node = mecab_lattice->bos_node();
216 node != NULL; node = node->next) {
217 token_num += 1;
218 }
219
220 /* If the term has more than one token, convert it to a phrase.*/
221 if (bool_info->quot == NULL && token_num > 1) {
222 term_converted = true;
223
224 bool_info->type = FT_TOKEN_LEFT_PAREN;
225 bool_info->quot = reinterpret_cast<char*>(1);
226
227 ret = param->mysql_add_word(param, NULL, 0, bool_info);
228 if (ret != 0) {
229 return(ret);
230 }
231 }
232 }
233
234 for (const MeCab::Node* node = mecab_lattice->bos_node();
235 node != NULL; node = node->next) {
236 bool_info->position = position;
237 position += node->rlength;
238
239 param->mysql_add_word(param, const_cast<char*>(node->surface),
240 node->length,
241 term_converted ? &token_info : bool_info);
242 }
243
244 if (term_converted) {
245 bool_info->type = FT_TOKEN_RIGHT_PAREN;
246 ret = param->mysql_add_word(param, NULL, 0, bool_info);
247
248 DBUG_ASSERT(bool_info->quot == NULL);
249 bool_info->type = FT_TOKEN_WORD;
250 }
251
252 return(ret);
253 }
254
255 /** MeCab parser parse a document.
256 @param[in] param plugin parser param
257 @retval 0 on success
258 @retval 1 on failure. */
259 static
260 int
mecab_parser_parse(MYSQL_FTPARSER_PARAM * param)261 mecab_parser_parse(
262 MYSQL_FTPARSER_PARAM* param)
263 {
264 MeCab::Lattice* mecab_lattice = NULL;
265 MYSQL_FTPARSER_BOOLEAN_INFO bool_info =
266 { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
267 int ret = 0;
268 const char* csname = NULL;
269
270 /* Mecab supports utf8mb4(utf8), eucjpms(ujis) and cp932(sjis). */
271 if (strcmp(param->cs->csname, MY_UTF8MB4) == 0) {
272 csname = "utf8";
273 } else if (strcmp(param->cs->csname, "eucjpms") == 0) {
274 csname = "ujis";
275 } else if (strcmp(param->cs->csname, "cp932") == 0) {
276 csname = "sjis";
277 } else {
278 csname = param->cs->csname;
279 }
280
281 /* Check charset */
282 if (strcmp(mecab_charset, csname) != 0) {
283 char error_msg[128];
284
285 my_snprintf(error_msg, 127, "Fulltext index charset '%s'"
286 " doesn't match mecab charset '%s'.",
287 param->cs->csname, mecab_charset);
288 my_message(ER_ERROR_ON_WRITE, error_msg, MYF(0));
289
290 return(1);
291 }
292
293 DBUG_ASSERT(param->cs->mbminlen == 1);
294
295 /* Create mecab lattice for parsing */
296 mecab_lattice = mecab_model->createLattice();
297 if (mecab_lattice == NULL) {
298 sql_print_error("Mecab: createLattice() failed: %s",
299 MeCab::getLastError());
300 return(1);
301 }
302
303 /* Allocate a new string with '\0' in the end to avoid
304 valgrind error "Invalid read of size 1" in mecab. */
305 DBUG_ASSERT(param->length >= 0);
306 int doc_length = param->length;
307 char* doc = reinterpret_cast<char*>(malloc(doc_length + 1));
308
309 if (doc == NULL) {
310 my_error(ER_OUTOFMEMORY, MYF(0), doc_length);
311 return(1);
312 }
313
314 memcpy(doc, param->doc, doc_length);
315 doc[doc_length]= '\0';
316
317 switch(param->mode) {
318 case MYSQL_FTPARSER_SIMPLE_MODE:
319 case MYSQL_FTPARSER_WITH_STOPWORDS:
320 ret = mecab_parse(mecab_lattice, param, doc,
321 doc_length, &bool_info);
322
323 break;
324
325 case MYSQL_FTPARSER_FULL_BOOLEAN_INFO:
326 uchar* start = reinterpret_cast<uchar*>(doc);
327 uchar* end = start + doc_length;
328 FT_WORD word = {NULL, 0, 0};
329
330 while (fts_get_word(param->cs, &start, end, &word, &bool_info)) {
331 /* Don't convert term with wildcard. */
332 if (bool_info.type == FT_TOKEN_WORD
333 && !bool_info.trunc) {
334 ret = mecab_parse(
335 mecab_lattice,
336 param,
337 reinterpret_cast<char*>(word.pos),
338 word.len,
339 &bool_info);
340 } else {
341 ret = param->mysql_add_word(
342 param,
343 reinterpret_cast<char*>(word.pos),
344 word.len,
345 &bool_info);
346 }
347
348 if (ret != 0) {
349 break;
350 }
351 }
352 }
353
354 free(doc);
355 delete mecab_lattice;
356
357 return(ret);
358 }
359
360 /** Fulltext MeCab Parser Descriptor*/
361 static struct st_mysql_ftparser mecab_parser_descriptor =
362 {
363 MYSQL_FTPARSER_INTERFACE_VERSION,
364 mecab_parser_parse,
365 0,
366 0
367 };
368
369 /* MeCab plugin status variables */
370 static struct st_mysql_show_var mecab_status[] =
371 {
372 {"mecab_charset", mecab_charset, SHOW_CHAR, SHOW_SCOPE_GLOBAL},
373 {0, 0, enum_mysql_show_type(0), SHOW_SCOPE_GLOBAL}
374 };
375
376 static MYSQL_SYSVAR_STR(rc_file, mecab_rc_file,
377 PLUGIN_VAR_READONLY,
378 "MECABRC file path",
379 NULL, NULL, NULL);
380
381 /* MeCab plugin system variables */
382 static struct st_mysql_sys_var* mecab_system_variables[]= {
383 MYSQL_SYSVAR(rc_file),
384 NULL
385 };
386
387 /* MeCab plugin descriptor */
mysql_declare_plugin(mecab_parser)388 mysql_declare_plugin(mecab_parser)
389 {
390 MYSQL_FTPARSER_PLUGIN, /*!< type */
391 &mecab_parser_descriptor, /*!< descriptor */
392 "mecab", /*!< name */
393 "Oracle Corp", /*!< author */
394 "Mecab Full-Text Parser for Japanese", /*!< description*/
395 PLUGIN_LICENSE_GPL, /*!< license */
396 mecab_parser_plugin_init, /*!< init function (when loaded)*/
397 mecab_parser_plugin_deinit, /*!< deinit function (when unloaded)*/
398 0x0001, /*!< version */
399 mecab_status, /*!< status variables */
400 mecab_system_variables, /*!< system variables */
401 NULL,
402 0,
403 }
404 mysql_declare_plugin_end;
405