1 /* Copyright (c) 2014, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
22 
23 #include "my_config.h"
24 #include "mysqld_error.h"
25 #include <string>
26 #include <log.h>
27 #include <mecab.h>
28 #include <fts0tokenize.h>
29 
30 /* We are following InnoDB coding guidelines. */
31 
32 /** Global mecab objects shared by all threads. */
33 static MeCab::Model*	mecab_model = NULL;
34 static MeCab::Tagger*	mecab_tagger = NULL;
35 
36 /** Mecab charset. */
37 static char	mecab_charset[64];
38 
39 /** Mecab rc file path. */
40 static char*	mecab_rc_file;
41 
42 static const char*	mecab_min_supported_version = "0.993";
43 static const char*	mecab_max_supported_version = "0.996";
44 
45 /** Set MeCab parser charset.
46 @param[in]	charset charset string
47 @retval	true	on success
48 @retval	false	on failure */
49 static
50 bool
mecab_parser_check_and_set_charset(const char * charset)51 mecab_parser_check_and_set_charset(
52 	const char*	charset)
53 {
54 	/* Array used to map mecab charset to mysql charset. */
55 	static const int	mecab_charset_count = 4;
56 	static const char*	mecab_charset_values[mecab_charset_count][2] = {
57 		{"euc-jp",	"ujis"},
58 		{"sjis",	"sjis"},
59 		{"utf-8",	"utf8"},
60 		{"utf8",	"utf8"}
61 	};
62 
63 	for (int i = 0; i < mecab_charset_count; i++) {
64 		if (native_strcasecmp(charset, mecab_charset_values[i][0])
65 		    == 0 ) {
66 			strcpy(mecab_charset, mecab_charset_values[i][1]);
67 			return(true);
68 		}
69 	}
70 
71 	return(false);
72 }
73 
74 /** MeCab parser plugin initialization.
75 @retval 0 on success
76 @retval 1 on failure. */
77 static
78 int
mecab_parser_plugin_init(void *)79 mecab_parser_plugin_init(void*)
80 {
81 	const MeCab::DictionaryInfo*	mecab_dict;
82 
83 	/* Check mecab version. */
84 	if (strcmp(MeCab::Model::version(), mecab_min_supported_version) < 0) {
85 		sql_print_error("Mecab v%s is not supported,"
86 				" the lowest version supported is v%s.",
87 				MeCab::Model::version(),
88 				mecab_min_supported_version);
89 		return(1);
90 	}
91 
92 	if (strcmp(MeCab::Model::version(), mecab_max_supported_version) > 0) {
93 		sql_print_warning("Mecab v%s is not verified,"
94 				  " the highest version supported is v%s.",
95 				  MeCab::Model::version(),
96 				  mecab_max_supported_version);
97 	}
98 
99 	if (mecab_rc_file != NULL) {
100 		std::string	rcfile_arg;
101 
102 		/* See src/tagger.cpp for available options.
103 		--rcfile=<mecabrc file>  "use FILE as resource file" */
104 		rcfile_arg += "--rcfile=";
105 		rcfile_arg += mecab_rc_file;
106 
107 		/* It seems we *must* have some kind of mecabrc
108 		file available before calling createModel, see
109 		load_dictionary_resource() in  src/utils.cpp */
110 		sql_print_information("Mecab: Trying createModel(%s)",
111 				      rcfile_arg.c_str());
112 
113 		mecab_model = MeCab::createModel(rcfile_arg.c_str());
114 	} else {
115 		sql_print_information("Mecab: Trying createModel()");
116 		mecab_model = MeCab::createModel("");
117 	}
118 
119 	if (mecab_model == NULL) {
120 		sql_print_error("Mecab: createModel() failed: %s",
121 				MeCab::getLastError());
122 		return(1);
123 	}
124 
125 	mecab_tagger = mecab_model->createTagger();
126 	if (mecab_tagger == NULL) {
127 		sql_print_error("Mecab: createTagger() failed: %s",
128 				MeCab::getLastError());
129 		delete mecab_model;
130 		mecab_model= NULL;
131 		return(1);
132 	}
133 
134 	mecab_dict = mecab_model->dictionary_info();
135 	mecab_charset[0] = '\0';
136 	if (!mecab_parser_check_and_set_charset(mecab_dict->charset)) {
137 		delete mecab_tagger;
138 		mecab_tagger = NULL;
139 
140 		sql_print_error("Mecab: Unsupported dictionary charset %s",
141 				mecab_dict->charset);
142 
143 		delete mecab_model;
144 		mecab_model = NULL;
145 
146 		return(1);
147 	} else {
148 		sql_print_information("Mecab: Loaded dictionary charset is %s",
149 				      mecab_dict->charset);
150 		return(0);
151 	}
152 }
153 
154 /** MeCab parser plugin deinit
155 @retval	0 */
156 static
157 int
mecab_parser_plugin_deinit(void *)158 mecab_parser_plugin_deinit(void*)
159 {
160 	delete mecab_tagger;
161 	mecab_tagger = NULL;
162 
163 	delete mecab_model;
164 	mecab_model = NULL;
165 
166 	return(0);
167 }
168 
169 /** Parse a document by MeCab.
170 @param[in]	mecab_lattice	mecab lattice
171 @param[in]	param		plugin parser param
172 @param[in]	doc		document to parse
173 @param[in]	len		document length
174 @param[in,out]	bool_info	boolean info
175 @retvat	0	on success
176 @retval	1	on failure. */
177 static
178 int
mecab_parse(MeCab::Lattice * mecab_lattice,MYSQL_FTPARSER_PARAM * param,char * doc,int len,MYSQL_FTPARSER_BOOLEAN_INFO * bool_info)179 mecab_parse(
180 	MeCab::Lattice*		mecab_lattice,
181 	MYSQL_FTPARSER_PARAM*	param,
182 	char*			doc,
183 	int			len,
184 	MYSQL_FTPARSER_BOOLEAN_INFO*
185 				bool_info)
186 {
187 	static MYSQL_FTPARSER_BOOLEAN_INFO token_info =
188 		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
189 	int	position = 0;
190 	int	token_num = 0;
191 	int	ret = 0;
192 	bool	term_converted = false;
193 	const CHARSET_INFO*	cs = param->cs;
194 	char*	end = const_cast<char*>(doc) + len;
195 
196 	try {
197 		mecab_lattice->set_sentence(doc, len);
198 
199 		if(!mecab_tagger->parse(mecab_lattice)) {
200 			sql_print_error("Mecab: parse() failed: %s",
201 					mecab_lattice->what());
202 			return(1);
203 		}
204 	} catch (std::bad_alloc const &) {
205 		sql_print_error("Mecab: parse() failed: out of memory.");
206 
207 		return(1);
208 	}
209 
210 	if (param->mode == MYSQL_FTPARSER_FULL_BOOLEAN_INFO) {
211 		for (const MeCab::Node* node = mecab_lattice->bos_node();
212 		     node != NULL; node = node->next) {
213 			token_num += 1;
214 		}
215 
216 		/* If the term has more than one token, convert it to a phrase.*/
217 		if (bool_info->quot == NULL && token_num > 1) {
218 			term_converted = true;
219 
220 			bool_info->type = FT_TOKEN_LEFT_PAREN;
221 			bool_info->quot = reinterpret_cast<char*>(1);
222 
223 			ret = param->mysql_add_word(param, NULL, 0, bool_info);
224 			if (ret != 0) {
225 				return(ret);
226 			}
227 		}
228 	}
229 
230 	for (const MeCab::Node* node = mecab_lattice->bos_node();
231 	     node != NULL; node = node->next) {
232 		int ctype = 0;
233 		cs->cset->ctype(cs, &ctype, reinterpret_cast<const uchar *>(node->surface),
234 		                reinterpret_cast<const uchar *>(end));
235 
236 		/* Skip control characters */
237 		if (!(ctype & _MY_CTR)) {
238 			bool_info->position = position;
239 			position += node->rlength;
240 
241 			param->mysql_add_word(param, const_cast<char *>(node->surface),
242 														node->length,
243 														term_converted ? &token_info : bool_info);
244 		}
245 	}
246 
247 	if (term_converted) {
248 		bool_info->type = FT_TOKEN_RIGHT_PAREN;
249 		ret = param->mysql_add_word(param, NULL, 0, bool_info);
250 
251 		assert(bool_info->quot == NULL);
252 		bool_info->type = FT_TOKEN_WORD;
253 	}
254 
255 	return(ret);
256 }
257 
258 /** MeCab parser parse a document.
259 @param[in]	param	plugin parser param
260 @retval	0	on success
261 @retval	1	on failure. */
262 static
263 int
mecab_parser_parse(MYSQL_FTPARSER_PARAM * param)264 mecab_parser_parse(
265 	MYSQL_FTPARSER_PARAM*	param)
266 {
267 	MeCab::Lattice*			mecab_lattice = NULL;
268 	MYSQL_FTPARSER_BOOLEAN_INFO	bool_info =
269 		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0};
270 	int		ret = 0;
271 	const char*	csname = NULL;
272 
273 	/* Mecab supports utf8mb4(utf8), eucjpms(ujis) and cp932(sjis). */
274 	if (strcmp(param->cs->csname, MY_UTF8MB4) == 0) {
275 		csname = "utf8";
276 	} else if (strcmp(param->cs->csname, "eucjpms") == 0) {
277 		csname = "ujis";
278 	} else if (strcmp(param->cs->csname, "cp932") == 0) {
279 		csname = "sjis";
280 	} else {
281 		csname = param->cs->csname;
282 	}
283 
284 	/* Check charset */
285 	if (strcmp(mecab_charset, csname) != 0) {
286 		char	error_msg[128];
287 
288 		my_snprintf(error_msg, 127, "Fulltext index charset '%s'"
289 			    " doesn't match mecab charset '%s'.",
290 			    param->cs->csname, mecab_charset);
291 		my_message(ER_ERROR_ON_WRITE, error_msg, MYF(0));
292 
293 		return(1);
294 	}
295 
296 	assert(param->cs->mbminlen == 1);
297 
298 	/* Create mecab lattice for parsing */
299 	mecab_lattice = mecab_model->createLattice();
300 	if (mecab_lattice == NULL) {
301 		sql_print_error("Mecab: createLattice() failed: %s",
302 				MeCab::getLastError());
303 		return(1);
304 	}
305 
306 	/* Allocate a new string with '\0' in the end to avoid
307 	valgrind error "Invalid read of size 1" in mecab. */
308 	assert(param->length >= 0);
309 	int	doc_length = param->length;
310 	char*	doc = reinterpret_cast<char*>(malloc(doc_length + 1));
311 
312 	if (doc == NULL) {
313 		my_error(ER_OUTOFMEMORY, MYF(0), doc_length);
314 		return(1);
315 	}
316 
317 	memcpy(doc, param->doc, doc_length);
318 	doc[doc_length]= '\0';
319 
320 	switch(param->mode) {
321 	case MYSQL_FTPARSER_SIMPLE_MODE:
322 	case MYSQL_FTPARSER_WITH_STOPWORDS:
323 		ret = mecab_parse(mecab_lattice, param, doc,
324 				  doc_length, &bool_info);
325 
326 		break;
327 
328 	case MYSQL_FTPARSER_FULL_BOOLEAN_INFO:
329 		uchar*		start = reinterpret_cast<uchar*>(doc);
330 		uchar*		end = start + doc_length;
331 		FT_WORD		word = {NULL, 0, 0};
332 		const bool	extra_word_chars =
333 			thd_get_ft_query_extra_word_chars();
334 
335 		while (fts_get_word(param->cs, extra_word_chars, &start, end,
336 				    &word, &bool_info)) {
337 			/* Don't convert term with wildcard. */
338 			if (bool_info.type == FT_TOKEN_WORD
339 			    && !bool_info.trunc) {
340 				ret = mecab_parse(
341 					mecab_lattice,
342 					param,
343 					reinterpret_cast<char*>(word.pos),
344 					word.len,
345 					&bool_info);
346 			} else {
347 				ret = param->mysql_add_word(
348 					param,
349 					reinterpret_cast<char*>(word.pos),
350 					word.len,
351 					&bool_info);
352 			}
353 
354 			if (ret != 0) {
355 				break;
356 			}
357 		}
358 	}
359 
360 	free(doc);
361 	delete mecab_lattice;
362 
363 	return(ret);
364 }
365 
366 /** Fulltext MeCab Parser Descriptor*/
367 static struct st_mysql_ftparser mecab_parser_descriptor =
368 {
369 	MYSQL_FTPARSER_INTERFACE_VERSION,
370 	mecab_parser_parse,
371 	0,
372 	0
373 };
374 
375 /* MeCab plugin status variables */
376 static struct st_mysql_show_var mecab_status[] =
377 {
378 	{"mecab_charset", mecab_charset, SHOW_CHAR, SHOW_SCOPE_GLOBAL},
379 	{0, 0, enum_mysql_show_type(0), SHOW_SCOPE_GLOBAL}
380 };
381 
382 static MYSQL_SYSVAR_STR(rc_file, mecab_rc_file,
383   PLUGIN_VAR_READONLY,
384   "MECABRC file path",
385   NULL, NULL, NULL);
386 
387 /* MeCab plugin system variables */
388 static struct st_mysql_sys_var* mecab_system_variables[]= {
389 	MYSQL_SYSVAR(rc_file),
390 	NULL
391 };
392 
393 /* MeCab plugin descriptor */
mysql_declare_plugin(mecab_parser)394 mysql_declare_plugin(mecab_parser)
395 {
396 	MYSQL_FTPARSER_PLUGIN,		/*!< type	*/
397 	&mecab_parser_descriptor,	/*!< descriptor	*/
398 	"mecab",			/*!< name	*/
399 	"Oracle Corp",			/*!< author	*/
400 	"Mecab Full-Text Parser for Japanese",	/*!< description*/
401 	PLUGIN_LICENSE_GPL,		/*!< license	*/
402 	mecab_parser_plugin_init,	/*!< init function (when loaded)*/
403 	mecab_parser_plugin_deinit,	/*!< deinit function (when unloaded)*/
404 	0x0001,				/*!< version	*/
405 	mecab_status,			/*!< status variables	*/
406 	mecab_system_variables,		/*!< system variables	*/
407 	NULL,
408 	0,
409 }
410 mysql_declare_plugin_end;
411