1 /*****************************************************************************
2 
3 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2020, MariaDB Corporation.
5 
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9 
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 
18 *****************************************************************************/
19 
20 /******************************************************************//**
21 @file fts/fts0tokenize.cc
Full Text Search plugin tokenizer, adapted from the MyISAM tokenizer
23 
24 Created 2014/11/17 Shaohua Wang
25 ***********************************************************************/
26 
27 #include "ft_global.h"
28 #include "mysql/plugin_ftparser.h"
29 #include "m_ctype.h"
30 
31 /* Macros and structs below are from ftdefs.h in MyISAM */
/** Test whether a character is a regular word character: either its ctype
flags contain one of _MY_U, _MY_L or _MY_NMR (the upper-case, lower-case and
numeral flags declared in m_ctype.h), or its first byte is an underscore.
@param c	ctype flag bits reported for the character
@param ch	first byte of the character */
#define true_word_char(c, ch)	\
	(((c) & (_MY_U | _MY_L | _MY_NMR)) || (ch) == '_')

/** Test whether a character is a "misc" word character. This always
evaluates to 0 here, so no character is treated as one; the argument is not
evaluated. */
#define misc_word_char(X)	(0)
37 
38 /** Boolean search syntax */
39 static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
40 
41 #define FTB_YES   (fts_boolean_syntax[0])
42 #define FTB_EGAL  (fts_boolean_syntax[1])
43 #define FTB_NO    (fts_boolean_syntax[2])
44 #define FTB_INC   (fts_boolean_syntax[3])
45 #define FTB_DEC   (fts_boolean_syntax[4])
46 #define FTB_LBR   (fts_boolean_syntax[5])
47 #define FTB_RBR   (fts_boolean_syntax[6])
48 #define FTB_NEG   (fts_boolean_syntax[7])
49 #define FTB_TRUNC (fts_boolean_syntax[8])
50 #define FTB_LQUOT (fts_boolean_syntax[10])
51 #define FTB_RQUOT (fts_boolean_syntax[11])
52 
53 /** FTS query token */
54 typedef struct st_ft_word {
55         uchar* pos;     /*!< word start pointer */
56         uint   len;     /*!< word len */
57         double weight;  /*!< word weight, unused in innodb */
58 } FT_WORD;
59 
60 /** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
61 Differences: a. code format changed; b. stopword processing removed.
62 @param[in]	cs	charset
63 @param[in,out]	start	doc start pointer
64 @param[in,out]	end	doc end pointer
65 @param[in,out]	word	token
66 @param[in,out]	info	token info
67 @retval	0	eof
68 @retval	1	word found
69 @retval	2	left bracket
70 @retval	3	right bracket
71 @retval	4	stopword found */
72 inline
73 uchar
fts_get_word(const CHARSET_INFO * cs,uchar ** start,uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * info)74 fts_get_word(
75 	const CHARSET_INFO*	cs,
76 	uchar**			start,
77 	uchar*			end,
78 	FT_WORD*		word,
79 	MYSQL_FTPARSER_BOOLEAN_INFO*
80 				info)
81 {
82 	uchar*	doc = *start;
83 	int	ctype;
84 	uint	mwc;
85 	uint	length;
86 	int	mbl;
87 
88 	info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
89 	info->weight_adjust = info->wasign = 0;
90 	info->type = FT_TOKEN_EOF;
91 
92 	while (doc < end) {
93 		for (; doc < end;
94 		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
95 			mbl = cs->ctype(&ctype, doc, end);
96 
97 			if (true_word_char(ctype, *doc)) {
98 				break;
99 			}
100 
101 			if (*doc == FTB_RQUOT && info->quot) {
102 				*start = doc + 1;
103 				info->type = FT_TOKEN_RIGHT_PAREN;
104 
105 				return(info->type);
106 			}
107 
108 			if (!info->quot) {
109 				if (*doc == FTB_LBR
110 				    || *doc == FTB_RBR
111 				    || *doc == FTB_LQUOT) {
112 					/* param->prev=' '; */
113 					*start = doc + 1;
114 					if (*doc == FTB_LQUOT) {
115 						info->quot = (char*)1;
116 					}
117 
118 					info->type = (*doc == FTB_RBR ?
119 						       FT_TOKEN_RIGHT_PAREN :
120 						       FT_TOKEN_LEFT_PAREN);
121 
122 					return(info->type);
123 				}
124 
125 				if (info->prev == ' ') {
126 					if (*doc == FTB_YES) {
127 						info->yesno = +1;
128 						continue;
129 					} else if (*doc == FTB_EGAL) {
130 						info->yesno = 0;
131 						continue;
132 					} else if (*doc == FTB_NO) {
133 						info->yesno = -1;
134 						continue;
135 					} else if (*doc == FTB_INC) {
136 						info->weight_adjust++;
137 						continue;
138 					} else if (*doc == FTB_DEC) {
139 						info->weight_adjust--;
140 						continue;
141 					} else if (*doc == FTB_NEG) {
142 						info->wasign = !info->wasign;
143 						continue;
144 					}
145 				}
146 			}
147 
148 			info->prev = char(*doc);
149 			info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
150 			info->weight_adjust = info->wasign = 0;
151 		}
152 
153 		mwc = length = 0;
154 		for (word->pos = doc;
155 		     doc < end;
156 		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
157 			mbl = cs->ctype(&ctype, doc, end);
158 
159 			if (true_word_char(ctype, *doc)) {
160 				mwc = 0;
161 			} else if (!misc_word_char(*doc) || mwc) {
162 				break;
163 			} else {
164 				mwc++;
165 			}
166 		}
167 
168 		/* Be sure *prev is true_word_char. */
169 		info->prev = 'A';
170 		word->len = (uint)(doc-word->pos) - mwc;
171 
172 		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
173 			doc++;
174 		}
175 
176 		/* We don't check stopword here. */
177 		*start = doc;
178 		info->type = FT_TOKEN_WORD;
179 
180 		return(info->type);
181 	}
182 
183 	if (info->quot) {
184 		*start = doc;
185 		info->type = FT_TOKEN_RIGHT_PAREN;
186 	}
187 
188 	return(info->type);
189 }
190