1 /* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
22 
23 #include "my_config.h"
24 #include "mysqld_error.h"
25 #include <string>
26 #include <log.h>
27 #include <mecab.h>
28 #include <fts0tokenize.h>
29 
30 /* We are following InnoDB coding guidelines. */
31 
32 /** Global mecab objects shared by all threads. */
33 static MeCab::Model*	mecab_model = NULL;
34 static MeCab::Tagger*	mecab_tagger = NULL;
35 
36 /** Mecab charset. */
37 static char	mecab_charset[64];
38 
39 /** Mecab rc file path. */
40 static char*	mecab_rc_file;
41 
42 static const char*	mecab_min_supported_version = "0.993";
43 static const char*	mecab_max_supported_version = "0.996";
44 
45 #if defined(BUNDLE_MECAB)
46 static const bool bundle_mecab= true;
47 #else
48 static const bool bundle_mecab= false;
49 #endif
50 
51 /** Set MeCab parser charset.
52 @param[in]	charset charset string
53 @retval	true	on success
54 @retval	false	on failure */
55 static
56 bool
mecab_parser_check_and_set_charset(const char * charset)57 mecab_parser_check_and_set_charset(
58 	const char*	charset)
59 {
60 	/* Array used to map mecab charset to mysql charset. */
61 	static const int	mecab_charset_count = 4;
62 	static const char*	mecab_charset_values[mecab_charset_count][2] = {
63 		{"euc-jp",	"ujis"},
64 		{"sjis",	"sjis"},
65 		{"utf-8",	"utf8"},
66 		{"utf8",	"utf8"}
67 	};
68 
69 	for (int i = 0; i < mecab_charset_count; i++) {
70 		if (native_strcasecmp(charset, mecab_charset_values[i][0])
71 		    == 0 ) {
72 			strcpy(mecab_charset, mecab_charset_values[i][1]);
73 			return(true);
74 		}
75 	}
76 
77 	return(false);
78 }
79 
80 /** MeCab parser plugin initialization.
81 @retval 0 on success
82 @retval 1 on failure. */
83 static
84 int
mecab_parser_plugin_init(void *)85 mecab_parser_plugin_init(void*)
86 {
87 	const MeCab::DictionaryInfo*	mecab_dict;
88 
89 	/* Check mecab version. */
90 	if (strcmp(MeCab::Model::version(), mecab_min_supported_version) < 0) {
91 		sql_print_error("Mecab v%s is not supported,"
92 				" the lowest version supported is v%s.",
93 				MeCab::Model::version(),
94 				mecab_min_supported_version);
95 		return(1);
96 	}
97 
98 	if (strcmp(MeCab::Model::version(), mecab_max_supported_version) > 0) {
99 		sql_print_warning("Mecab v%s is not verified,"
100 				  " the highest version supported is v%s.",
101 				  MeCab::Model::version(),
102 				  mecab_max_supported_version);
103 	}
104 
105 	if (mecab_rc_file != NULL) {
106 		std::string	rcfile_arg;
107 
108 		/* See src/tagger.cpp for available options.
109 		--rcfile=<mecabrc file>  "use FILE as resource file" */
110 		rcfile_arg += "--rcfile=";
111 		rcfile_arg += mecab_rc_file;
112 
113 		/* It seems we *must* have some kind of mecabrc
114 		file available before calling createModel, see
115 		load_dictionary_resource() in  src/utils.cpp */
116 		sql_print_information("Mecab: Trying createModel(%s)",
117 				      rcfile_arg.c_str());
118 
119 		mecab_model = MeCab::createModel(rcfile_arg.c_str());
120 	} else {
121 		sql_print_information("Mecab: Trying createModel()");
122 		mecab_model = MeCab::createModel("");
123 	}
124 
125 	if (mecab_model == NULL) {
126 		sql_print_error("Mecab: createModel() failed: %s",
127 				MeCab::getLastError());
128 		return(1);
129 	}
130 
131 	mecab_tagger = mecab_model->createTagger();
132 	if (mecab_tagger == NULL) {
133 		sql_print_error("Mecab: createTagger() failed: %s",
134 				MeCab::getLastError());
135 		delete mecab_model;
136 		mecab_model= NULL;
137 		return(1);
138 	}
139 
140 	mecab_dict = mecab_model->dictionary_info();
141 	mecab_charset[0] = '\0';
142 	if (!mecab_parser_check_and_set_charset(mecab_dict->charset)) {
143 		delete mecab_tagger;
144 		mecab_tagger = NULL;
145 
146 		sql_print_error("Mecab: Unsupported dictionary charset %s",
147 				mecab_dict->charset);
148 
149 		delete mecab_model;
150 		mecab_model = NULL;
151 
152 		return(1);
153 	} else {
154 		sql_print_information("Mecab: Loaded dictionary charset is %s",
155 				      mecab_dict->charset);
156 		return(0);
157 	}
158 }
159 
160 /** MeCab parser plugin deinit
161 @retval	0 */
162 static
163 int
mecab_parser_plugin_deinit(void *)164 mecab_parser_plugin_deinit(void*)
165 {
166 	delete mecab_tagger;
167 	mecab_tagger = NULL;
168 
169 	delete mecab_model;
170 	mecab_model = NULL;
171 
172 	return(0);
173 }
174 
175 /** Parse a document by MeCab.
176 @param[in]	mecab_lattice	mecab lattice
177 @param[in]	param		plugin parser param
178 @param[in]	doc		document to parse
179 @param[in]	len		document length
180 @param[in,out]	bool_info	boolean info
181 @retvat	0	on success
182 @retval	1	on failure. */
183 static
184 int
mecab_parse(MeCab::Lattice * mecab_lattice,MYSQL_FTPARSER_PARAM * param,char * doc,int len,MYSQL_FTPARSER_BOOLEAN_INFO * bool_info)185 mecab_parse(
186 	MeCab::Lattice*		mecab_lattice,
187 	MYSQL_FTPARSER_PARAM*	param,
188 	char*			doc,
189 	int			len,
190 	MYSQL_FTPARSER_BOOLEAN_INFO*
191 				bool_info)
192 {
193 	static MYSQL_FTPARSER_BOOLEAN_INFO token_info =
194 		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
195 	int	position = 0;
196 	int	token_num = 0;
197 	int	ret = 0;
198 	bool	term_converted = false;
199 
200 	try {
201 		mecab_lattice->set_sentence(doc, len);
202 
203 		if(!mecab_tagger->parse(mecab_lattice)) {
204 			sql_print_error("Mecab: parse() failed: %s",
205 					mecab_lattice->what());
206 			return(1);
207 		}
208 	} catch (std::bad_alloc const &) {
209 		sql_print_error("Mecab: parse() failed: out of memory.");
210 
211 		return(1);
212 	}
213 
214 	if (param->mode == MYSQL_FTPARSER_FULL_BOOLEAN_INFO) {
215 		for (const MeCab::Node* node = mecab_lattice->bos_node();
216 		     node != NULL; node = node->next) {
217 			token_num += 1;
218 		}
219 
220 		/* If the term has more than one token, convert it to a phrase.*/
221 		if (bool_info->quot == NULL && token_num > 1) {
222 			term_converted = true;
223 
224 			bool_info->type = FT_TOKEN_LEFT_PAREN;
225 			bool_info->quot = reinterpret_cast<char*>(1);
226 
227 			ret = param->mysql_add_word(param, NULL, 0, bool_info);
228 			if (ret != 0) {
229 				return(ret);
230 			}
231 		}
232 	}
233 
234 	for (const MeCab::Node* node = mecab_lattice->bos_node();
235 	     node != NULL; node = node->next) {
236 		bool_info->position = position;
237 		position += node->rlength;
238 
239 		param->mysql_add_word(param, const_cast<char*>(node->surface),
240 				      node->length,
241 				      term_converted ? &token_info : bool_info);
242 	}
243 
244 	if (term_converted) {
245 		bool_info->type = FT_TOKEN_RIGHT_PAREN;
246 		ret = param->mysql_add_word(param, NULL, 0, bool_info);
247 
248 		assert(bool_info->quot == NULL);
249 		bool_info->type = FT_TOKEN_WORD;
250 	}
251 
252 	return(ret);
253 }
254 
255 /** MeCab parser parse a document.
256 @param[in]	param	plugin parser param
257 @retval	0	on success
258 @retval	1	on failure. */
259 static
260 int
mecab_parser_parse(MYSQL_FTPARSER_PARAM * param)261 mecab_parser_parse(
262 	MYSQL_FTPARSER_PARAM*	param)
263 {
264 	MeCab::Lattice*			mecab_lattice = NULL;
265 	MYSQL_FTPARSER_BOOLEAN_INFO	bool_info =
266 		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
267 	int		ret = 0;
268 	const char*	csname = NULL;
269 
270 	/* Mecab supports utf8mb4(utf8), eucjpms(ujis) and cp932(sjis). */
271 	if (strcmp(param->cs->csname, MY_UTF8MB4) == 0) {
272 		csname = "utf8";
273 	} else if (strcmp(param->cs->csname, "eucjpms") == 0) {
274 		csname = "ujis";
275 	} else if (strcmp(param->cs->csname, "cp932") == 0) {
276 		csname = "sjis";
277 	} else {
278 		csname = param->cs->csname;
279 	}
280 
281 	/* Check charset */
282 	if (strcmp(mecab_charset, csname) != 0) {
283 		char	error_msg[128];
284 
285 		my_snprintf(error_msg, 127, "Fulltext index charset '%s'"
286 			    " doesn't match mecab charset '%s'.",
287 			    param->cs->csname, mecab_charset);
288 		my_message(ER_ERROR_ON_WRITE, error_msg, MYF(0));
289 
290 		return(1);
291 	}
292 
293 	assert(param->cs->mbminlen == 1);
294 
295 	/* Create mecab lattice for parsing */
296 	mecab_lattice = mecab_model->createLattice();
297 	if (mecab_lattice == NULL) {
298 		sql_print_error("Mecab: createLattice() failed: %s",
299 				MeCab::getLastError());
300 		return(1);
301 	}
302 
303 	/* Allocate a new string with '\0' in the end to avoid
304 	valgrind error "Invalid read of size 1" in mecab. */
305 	assert(param->length >= 0);
306 	int	doc_length = param->length;
307 	char*	doc = reinterpret_cast<char*>(malloc(doc_length + 1));
308 
309 	if (doc == NULL) {
310 		my_error(ER_OUTOFMEMORY, MYF(0), doc_length);
311 		return(1);
312 	}
313 
314 	memcpy(doc, param->doc, doc_length);
315 	doc[doc_length]= '\0';
316 
317 	switch(param->mode) {
318 	case MYSQL_FTPARSER_SIMPLE_MODE:
319 	case MYSQL_FTPARSER_WITH_STOPWORDS:
320 		ret = mecab_parse(mecab_lattice, param, doc,
321 				  doc_length, &bool_info);
322 
323 		break;
324 
325 	case MYSQL_FTPARSER_FULL_BOOLEAN_INFO:
326 		uchar*		start = reinterpret_cast<uchar*>(doc);
327 		uchar*		end = start + doc_length;
328 		FT_WORD		word = {NULL, 0, 0};
329 
330 		while (fts_get_word(param->cs, &start, end, &word, &bool_info)) {
331 			/* Don't convert term with wildcard. */
332 			if (bool_info.type == FT_TOKEN_WORD
333 			    && !bool_info.trunc) {
334 				ret = mecab_parse(
335 					mecab_lattice,
336 					param,
337 					reinterpret_cast<char*>(word.pos),
338 					word.len,
339 					&bool_info);
340 			} else {
341 				ret = param->mysql_add_word(
342 					param,
343 					reinterpret_cast<char*>(word.pos),
344 					word.len,
345 					&bool_info);
346 			}
347 
348 			if (ret != 0) {
349 				break;
350 			}
351 		}
352 	}
353 
354 	free(doc);
355 	delete mecab_lattice;
356 
357 	return(ret);
358 }
359 
360 /** Fulltext MeCab Parser Descriptor*/
361 static struct st_mysql_ftparser mecab_parser_descriptor =
362 {
363 	MYSQL_FTPARSER_INTERFACE_VERSION,
364 	mecab_parser_parse,
365 	0,
366 	0
367 };
368 
369 /* MeCab plugin status variables */
370 static struct st_mysql_show_var mecab_status[] =
371 {
372 	{"mecab_charset", mecab_charset, SHOW_CHAR, SHOW_SCOPE_GLOBAL},
373 	{0, 0, enum_mysql_show_type(0), SHOW_SCOPE_GLOBAL}
374 };
375 
376 static MYSQL_SYSVAR_STR(rc_file, mecab_rc_file,
377   PLUGIN_VAR_READONLY,
378   "MECABRC file path",
379   NULL, NULL, NULL);
380 
381 /* MeCab plugin system variables */
382 static struct st_mysql_sys_var* mecab_system_variables[]= {
383 	MYSQL_SYSVAR(rc_file),
384 	NULL
385 };
386 
387 /* MeCab plugin descriptor */
mysql_declare_plugin(mecab_parser)388 mysql_declare_plugin(mecab_parser)
389 {
390 	MYSQL_FTPARSER_PLUGIN,		/*!< type	*/
391 	&mecab_parser_descriptor,	/*!< descriptor	*/
392 	"mecab",			/*!< name	*/
393 	"Oracle Corp",			/*!< author	*/
394 	"Mecab Full-Text Parser for Japanese",	/*!< description*/
395 	PLUGIN_LICENSE_GPL,		/*!< license	*/
396 	mecab_parser_plugin_init,	/*!< init function (when loaded)*/
397 	mecab_parser_plugin_deinit,	/*!< deinit function (when unloaded)*/
398 	0x0001,				/*!< version	*/
399 	mecab_status,			/*!< status variables	*/
400 	mecab_system_variables,		/*!< system variables	*/
401 	NULL,
402 	0,
403 }
404 mysql_declare_plugin_end;
405