1 /* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 #include "my_config.h"
24 #include "mysqld_error.h"
25 #include <string>
26 #include <log.h>
27 #include <mecab.h>
28 #include <fts0tokenize.h>
29
30 /* We are following InnoDB coding guidelines. */
31
/** Global mecab objects shared by all threads.
Both are created in mecab_parser_plugin_init() and destroyed in
mecab_parser_plugin_deinit(); they stay NULL while the plugin is not
initialized. */
static MeCab::Model* mecab_model = NULL;
static MeCab::Tagger* mecab_tagger = NULL;

/** Mecab charset.
Holds the MySQL charset name that corresponds to the charset of the loaded
MeCab dictionary (filled in by mecab_parser_check_and_set_charset()).
The buffer is also published through the mecab_charset status variable. */
static char mecab_charset[64];

/** Mecab rc file path.
Storage of the rc_file system variable. When it is not NULL, plugin init
passes it to MeCab as "--rcfile=<path>". */
static char* mecab_rc_file;

/** Supported MeCab version range.
Plugin init compares MeCab::Model::version() against these with strcmp():
a version below the minimum makes init fail, a version above the maximum
only produces a warning. */
static const char* mecab_min_supported_version = "0.993";
static const char* mecab_max_supported_version = "0.996";
44
45 /** Set MeCab parser charset.
46 @param[in] charset charset string
47 @retval true on success
48 @retval false on failure */
49 static
50 bool
mecab_parser_check_and_set_charset(const char * charset)51 mecab_parser_check_and_set_charset(
52 const char* charset)
53 {
54 /* Array used to map mecab charset to mysql charset. */
55 static const int mecab_charset_count = 4;
56 static const char* mecab_charset_values[mecab_charset_count][2] = {
57 {"euc-jp", "ujis"},
58 {"sjis", "sjis"},
59 {"utf-8", "utf8"},
60 {"utf8", "utf8"}
61 };
62
63 for (int i = 0; i < mecab_charset_count; i++) {
64 if (native_strcasecmp(charset, mecab_charset_values[i][0])
65 == 0 ) {
66 strcpy(mecab_charset, mecab_charset_values[i][1]);
67 return(true);
68 }
69 }
70
71 return(false);
72 }
73
74 /** MeCab parser plugin initialization.
75 @retval 0 on success
76 @retval 1 on failure. */
77 static
78 int
mecab_parser_plugin_init(void *)79 mecab_parser_plugin_init(void*)
80 {
81 const MeCab::DictionaryInfo* mecab_dict;
82
83 /* Check mecab version. */
84 if (strcmp(MeCab::Model::version(), mecab_min_supported_version) < 0) {
85 sql_print_error("Mecab v%s is not supported,"
86 " the lowest version supported is v%s.",
87 MeCab::Model::version(),
88 mecab_min_supported_version);
89 return(1);
90 }
91
92 if (strcmp(MeCab::Model::version(), mecab_max_supported_version) > 0) {
93 sql_print_warning("Mecab v%s is not verified,"
94 " the highest version supported is v%s.",
95 MeCab::Model::version(),
96 mecab_max_supported_version);
97 }
98
99 if (mecab_rc_file != NULL) {
100 std::string rcfile_arg;
101
102 /* See src/tagger.cpp for available options.
103 --rcfile=<mecabrc file> "use FILE as resource file" */
104 rcfile_arg += "--rcfile=";
105 rcfile_arg += mecab_rc_file;
106
107 /* It seems we *must* have some kind of mecabrc
108 file available before calling createModel, see
109 load_dictionary_resource() in src/utils.cpp */
110 sql_print_information("Mecab: Trying createModel(%s)",
111 rcfile_arg.c_str());
112
113 mecab_model = MeCab::createModel(rcfile_arg.c_str());
114 } else {
115 sql_print_information("Mecab: Trying createModel()");
116 mecab_model = MeCab::createModel("");
117 }
118
119 if (mecab_model == NULL) {
120 sql_print_error("Mecab: createModel() failed: %s",
121 MeCab::getLastError());
122 return(1);
123 }
124
125 mecab_tagger = mecab_model->createTagger();
126 if (mecab_tagger == NULL) {
127 sql_print_error("Mecab: createTagger() failed: %s",
128 MeCab::getLastError());
129 delete mecab_model;
130 mecab_model= NULL;
131 return(1);
132 }
133
134 mecab_dict = mecab_model->dictionary_info();
135 mecab_charset[0] = '\0';
136 if (!mecab_parser_check_and_set_charset(mecab_dict->charset)) {
137 delete mecab_tagger;
138 mecab_tagger = NULL;
139
140 sql_print_error("Mecab: Unsupported dictionary charset %s",
141 mecab_dict->charset);
142
143 delete mecab_model;
144 mecab_model = NULL;
145
146 return(1);
147 } else {
148 sql_print_information("Mecab: Loaded dictionary charset is %s",
149 mecab_dict->charset);
150 return(0);
151 }
152 }
153
154 /** MeCab parser plugin deinit
155 @retval 0 */
156 static
157 int
mecab_parser_plugin_deinit(void *)158 mecab_parser_plugin_deinit(void*)
159 {
160 delete mecab_tagger;
161 mecab_tagger = NULL;
162
163 delete mecab_model;
164 mecab_model = NULL;
165
166 return(0);
167 }
168
169 /** Parse a document by MeCab.
170 @param[in] mecab_lattice mecab lattice
171 @param[in] param plugin parser param
172 @param[in] doc document to parse
173 @param[in] len document length
174 @param[in,out] bool_info boolean info
175 @retvat 0 on success
176 @retval 1 on failure. */
177 static
178 int
mecab_parse(MeCab::Lattice * mecab_lattice,MYSQL_FTPARSER_PARAM * param,char * doc,int len,MYSQL_FTPARSER_BOOLEAN_INFO * bool_info)179 mecab_parse(
180 MeCab::Lattice* mecab_lattice,
181 MYSQL_FTPARSER_PARAM* param,
182 char* doc,
183 int len,
184 MYSQL_FTPARSER_BOOLEAN_INFO*
185 bool_info)
186 {
187 static MYSQL_FTPARSER_BOOLEAN_INFO token_info =
188 { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
189 int position = 0;
190 int token_num = 0;
191 int ret = 0;
192 bool term_converted = false;
193 const CHARSET_INFO* cs = param->cs;
194 char* end = const_cast<char*>(doc) + len;
195
196 try {
197 mecab_lattice->set_sentence(doc, len);
198
199 if(!mecab_tagger->parse(mecab_lattice)) {
200 sql_print_error("Mecab: parse() failed: %s",
201 mecab_lattice->what());
202 return(1);
203 }
204 } catch (std::bad_alloc const &) {
205 sql_print_error("Mecab: parse() failed: out of memory.");
206
207 return(1);
208 }
209
210 if (param->mode == MYSQL_FTPARSER_FULL_BOOLEAN_INFO) {
211 for (const MeCab::Node* node = mecab_lattice->bos_node();
212 node != NULL; node = node->next) {
213 token_num += 1;
214 }
215
216 /* If the term has more than one token, convert it to a phrase.*/
217 if (bool_info->quot == NULL && token_num > 1) {
218 term_converted = true;
219
220 bool_info->type = FT_TOKEN_LEFT_PAREN;
221 bool_info->quot = reinterpret_cast<char*>(1);
222
223 ret = param->mysql_add_word(param, NULL, 0, bool_info);
224 if (ret != 0) {
225 return(ret);
226 }
227 }
228 }
229
230 for (const MeCab::Node* node = mecab_lattice->bos_node();
231 node != NULL; node = node->next) {
232 int ctype = 0;
233 cs->cset->ctype(cs, &ctype, reinterpret_cast<const uchar *>(node->surface),
234 reinterpret_cast<const uchar *>(end));
235
236 /* Skip control characters */
237 if (!(ctype & _MY_CTR)) {
238 bool_info->position = position;
239 position += node->rlength;
240
241 param->mysql_add_word(param, const_cast<char *>(node->surface),
242 node->length,
243 term_converted ? &token_info : bool_info);
244 }
245 }
246
247 if (term_converted) {
248 bool_info->type = FT_TOKEN_RIGHT_PAREN;
249 ret = param->mysql_add_word(param, NULL, 0, bool_info);
250
251 assert(bool_info->quot == NULL);
252 bool_info->type = FT_TOKEN_WORD;
253 }
254
255 return(ret);
256 }
257
258 /** MeCab parser parse a document.
259 @param[in] param plugin parser param
260 @retval 0 on success
261 @retval 1 on failure. */
262 static
263 int
mecab_parser_parse(MYSQL_FTPARSER_PARAM * param)264 mecab_parser_parse(
265 MYSQL_FTPARSER_PARAM* param)
266 {
267 MeCab::Lattice* mecab_lattice = NULL;
268 MYSQL_FTPARSER_BOOLEAN_INFO bool_info =
269 { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
270 int ret = 0;
271 const char* csname = NULL;
272
273 /* Mecab supports utf8mb4(utf8), eucjpms(ujis) and cp932(sjis). */
274 if (strcmp(param->cs->csname, MY_UTF8MB4) == 0) {
275 csname = "utf8";
276 } else if (strcmp(param->cs->csname, "eucjpms") == 0) {
277 csname = "ujis";
278 } else if (strcmp(param->cs->csname, "cp932") == 0) {
279 csname = "sjis";
280 } else {
281 csname = param->cs->csname;
282 }
283
284 /* Check charset */
285 if (strcmp(mecab_charset, csname) != 0) {
286 char error_msg[128];
287
288 my_snprintf(error_msg, 127, "Fulltext index charset '%s'"
289 " doesn't match mecab charset '%s'.",
290 param->cs->csname, mecab_charset);
291 my_message(ER_ERROR_ON_WRITE, error_msg, MYF(0));
292
293 return(1);
294 }
295
296 assert(param->cs->mbminlen == 1);
297
298 /* Create mecab lattice for parsing */
299 mecab_lattice = mecab_model->createLattice();
300 if (mecab_lattice == NULL) {
301 sql_print_error("Mecab: createLattice() failed: %s",
302 MeCab::getLastError());
303 return(1);
304 }
305
306 /* Allocate a new string with '\0' in the end to avoid
307 valgrind error "Invalid read of size 1" in mecab. */
308 assert(param->length >= 0);
309 int doc_length = param->length;
310 char* doc = reinterpret_cast<char*>(malloc(doc_length + 1));
311
312 if (doc == NULL) {
313 my_error(ER_OUTOFMEMORY, MYF(0), doc_length);
314 return(1);
315 }
316
317 memcpy(doc, param->doc, doc_length);
318 doc[doc_length]= '\0';
319
320 switch(param->mode) {
321 case MYSQL_FTPARSER_SIMPLE_MODE:
322 case MYSQL_FTPARSER_WITH_STOPWORDS:
323 ret = mecab_parse(mecab_lattice, param, doc,
324 doc_length, &bool_info);
325
326 break;
327
328 case MYSQL_FTPARSER_FULL_BOOLEAN_INFO:
329 uchar* start = reinterpret_cast<uchar*>(doc);
330 uchar* end = start + doc_length;
331 FT_WORD word = {NULL, 0, 0};
332 const bool extra_word_chars =
333 thd_get_ft_query_extra_word_chars();
334
335 while (fts_get_word(param->cs, extra_word_chars, &start, end,
336 &word, &bool_info)) {
337 /* Don't convert term with wildcard. */
338 if (bool_info.type == FT_TOKEN_WORD
339 && !bool_info.trunc) {
340 ret = mecab_parse(
341 mecab_lattice,
342 param,
343 reinterpret_cast<char*>(word.pos),
344 word.len,
345 &bool_info);
346 } else {
347 ret = param->mysql_add_word(
348 param,
349 reinterpret_cast<char*>(word.pos),
350 word.len,
351 &bool_info);
352 }
353
354 if (ret != 0) {
355 break;
356 }
357 }
358 }
359
360 free(doc);
361 delete mecab_lattice;
362
363 return(ret);
364 }
365
/** Fulltext MeCab Parser Descriptor.
Only the parse callback (mecab_parser_parse) is supplied; the two trailing
slots are left as 0.
NOTE(review): the trailing slots are presumably the optional per-use
init/deinit hooks of st_mysql_ftparser - confirm against the plugin
header. */
static struct st_mysql_ftparser mecab_parser_descriptor =
{
	MYSQL_FTPARSER_INTERFACE_VERSION,
	mecab_parser_parse,
	0,
	0
};
374
/* MeCab plugin status variables.
Publishes the mecab_charset buffer as the global-scope status variable
"mecab_charset". The trailing all-zero entry presumably marks the end of
the array - confirm against st_mysql_show_var users. */
static struct st_mysql_show_var mecab_status[] =
{
	{"mecab_charset", mecab_charset, SHOW_CHAR, SHOW_SCOPE_GLOBAL},
	{0, 0, enum_mysql_show_type(0), SHOW_SCOPE_GLOBAL}
};
381
/* Read-only string system variable "rc_file", stored in mecab_rc_file.
Plugin init passes its value to MeCab as "--rcfile=<path>" when it is set.
NOTE(review): the three trailing NULLs are presumably the check callback,
the update callback and the default value - confirm against the
MYSQL_SYSVAR_STR definition. */
static MYSQL_SYSVAR_STR(rc_file, mecab_rc_file,
	PLUGIN_VAR_READONLY,
	"MECABRC file path",
	NULL, NULL, NULL);

/* MeCab plugin system variables (NULL-terminated list) */
static struct st_mysql_sys_var* mecab_system_variables[]= {
	MYSQL_SYSVAR(rc_file),
	NULL
};
392
/* MeCab plugin descriptor.
Registers the fulltext parser plugin "mecab" together with its
init/deinit functions, status variables and system variables. */
mysql_declare_plugin(mecab_parser)
{
	MYSQL_FTPARSER_PLUGIN,	/*!< type */
	&mecab_parser_descriptor,	/*!< descriptor */
	"mecab",	/*!< name */
	"Oracle Corp",	/*!< author */
	"Mecab Full-Text Parser for Japanese",	/*!< description*/
	PLUGIN_LICENSE_GPL,	/*!< license */
	mecab_parser_plugin_init,	/*!< init function (when loaded)*/
	mecab_parser_plugin_deinit,	/*!< deinit function (when unloaded)*/
	0x0001,	/*!< version */
	mecab_status,	/*!< status variables */
	mecab_system_variables,	/*!< system variables */
	/* NOTE(review): the two trailing slots are presumably a reserved
	pointer and the plugin flags - confirm against st_mysql_plugin. */
	NULL,
	0,
}
mysql_declare_plugin_end;
411