1 /*****************************************************************************
2
3 Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2020, MariaDB Corporation.
5
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free Software
8 Foundation; version 2 of the License.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along with
15 this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17
18 *****************************************************************************/
19
20 /******************************************************************//**
21 @file fts/fts0tokenize.cc
Full Text Search plugin tokenizer, modelled on the MyISAM tokenizer
23
24 Created 2014/11/17 Shaohua Wang
25 ***********************************************************************/
26
27 #include "ft_global.h"
28 #include "mysql/plugin_ftparser.h"
29 #include "m_ctype.h"
30
31 /* Macros and structs below are from ftdefs.h in MyISAM */
32 /** Check a char is true word */
33 #define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
34
35 /** Check if a char is misc word */
36 #define misc_word_char(X) 0
37
38 /** Boolean search syntax */
39 static const char* fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
40
41 #define FTB_YES (fts_boolean_syntax[0])
42 #define FTB_EGAL (fts_boolean_syntax[1])
43 #define FTB_NO (fts_boolean_syntax[2])
44 #define FTB_INC (fts_boolean_syntax[3])
45 #define FTB_DEC (fts_boolean_syntax[4])
46 #define FTB_LBR (fts_boolean_syntax[5])
47 #define FTB_RBR (fts_boolean_syntax[6])
48 #define FTB_NEG (fts_boolean_syntax[7])
49 #define FTB_TRUNC (fts_boolean_syntax[8])
50 #define FTB_LQUOT (fts_boolean_syntax[10])
51 #define FTB_RQUOT (fts_boolean_syntax[11])
52
53 /** FTS query token */
54 typedef struct st_ft_word {
55 uchar* pos; /*!< word start pointer */
56 uint len; /*!< word len */
57 double weight; /*!< word weight, unused in innodb */
58 } FT_WORD;
59
60 /** Tokenizer for ngram referring to ft_get_word(ft_parser.c) in MyISAM.
61 Differences: a. code format changed; b. stopword processing removed.
62 @param[in] cs charset
63 @param[in,out] start doc start pointer
64 @param[in,out] end doc end pointer
65 @param[in,out] word token
66 @param[in,out] info token info
67 @retval 0 eof
68 @retval 1 word found
69 @retval 2 left bracket
70 @retval 3 right bracket
71 @retval 4 stopword found */
72 inline
73 uchar
fts_get_word(const CHARSET_INFO * cs,uchar ** start,uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * info)74 fts_get_word(
75 const CHARSET_INFO* cs,
76 uchar** start,
77 uchar* end,
78 FT_WORD* word,
79 MYSQL_FTPARSER_BOOLEAN_INFO*
80 info)
81 {
82 uchar* doc = *start;
83 int ctype;
84 uint mwc;
85 uint length;
86 int mbl;
87
88 info->yesno = (FTB_YES ==' ') ? 1 : (info->quot != 0);
89 info->weight_adjust = info->wasign = 0;
90 info->type = FT_TOKEN_EOF;
91
92 while (doc < end) {
93 for (; doc < end;
94 doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
95 mbl = cs->ctype(&ctype, doc, end);
96
97 if (true_word_char(ctype, *doc)) {
98 break;
99 }
100
101 if (*doc == FTB_RQUOT && info->quot) {
102 *start = doc + 1;
103 info->type = FT_TOKEN_RIGHT_PAREN;
104
105 return(info->type);
106 }
107
108 if (!info->quot) {
109 if (*doc == FTB_LBR
110 || *doc == FTB_RBR
111 || *doc == FTB_LQUOT) {
112 /* param->prev=' '; */
113 *start = doc + 1;
114 if (*doc == FTB_LQUOT) {
115 info->quot = (char*)1;
116 }
117
118 info->type = (*doc == FTB_RBR ?
119 FT_TOKEN_RIGHT_PAREN :
120 FT_TOKEN_LEFT_PAREN);
121
122 return(info->type);
123 }
124
125 if (info->prev == ' ') {
126 if (*doc == FTB_YES) {
127 info->yesno = +1;
128 continue;
129 } else if (*doc == FTB_EGAL) {
130 info->yesno = 0;
131 continue;
132 } else if (*doc == FTB_NO) {
133 info->yesno = -1;
134 continue;
135 } else if (*doc == FTB_INC) {
136 info->weight_adjust++;
137 continue;
138 } else if (*doc == FTB_DEC) {
139 info->weight_adjust--;
140 continue;
141 } else if (*doc == FTB_NEG) {
142 info->wasign = !info->wasign;
143 continue;
144 }
145 }
146 }
147
148 info->prev = char(*doc);
149 info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
150 info->weight_adjust = info->wasign = 0;
151 }
152
153 mwc = length = 0;
154 for (word->pos = doc;
155 doc < end;
156 length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
157 mbl = cs->ctype(&ctype, doc, end);
158
159 if (true_word_char(ctype, *doc)) {
160 mwc = 0;
161 } else if (!misc_word_char(*doc) || mwc) {
162 break;
163 } else {
164 mwc++;
165 }
166 }
167
168 /* Be sure *prev is true_word_char. */
169 info->prev = 'A';
170 word->len = (uint)(doc-word->pos) - mwc;
171
172 if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
173 doc++;
174 }
175
176 /* We don't check stopword here. */
177 *start = doc;
178 info->type = FT_TOKEN_WORD;
179
180 return(info->type);
181 }
182
183 if (info->quot) {
184 *start = doc;
185 info->type = FT_TOKEN_RIGHT_PAREN;
186 }
187
188 return(info->type);
189 }
190