1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 Copyright (c) 2020, MariaDB Corporation.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
16
17 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
18
19 #include "ma_ftdefs.h"
20
21 typedef struct st_maria_ft_docstat {
22 FT_WORD *list;
23 uint uniq;
24 double sum;
25 } FT_DOCSTAT;
26
27
28 typedef struct st_my_maria_ft_parser_param
29 {
30 TREE *wtree;
31 MEM_ROOT *mem_root;
32 } MY_FT_PARSER_PARAM;
33
34
/*
  Comparison callback for the word tree: orders two FT_WORD entries by
  their text, delegating to ha_compare_text() with the given charset.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *left= (uchar*) w1->pos;
  uchar *right= (uchar*) w2->pos;

  return ha_compare_text(cs, left, w1->len, right, w2->len, 0);
}
40
/*
  tree_walk() callback: assigns the local weight of one word, adds it to
  the running sum and appends a copy of the word to the output array.
  Always returns 0.

  NOTE(review): LWS_IN_USE is a macro defined outside this file; it
  presumably expands to an expression over the local names (e.g. "count"),
  so the parameter names are deliberately left untouched - confirm against
  ma_ftdefs.h before renaming anything here.
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  FT_WORD *slot= docstat->list;

  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  memcpy(slot, word, sizeof(FT_WORD));
  docstat->list= slot + 1;
  return 0;
}
48
49 /* transforms tree of words into the array, applying normalization */
50
maria_ft_linearize(TREE * wtree,MEM_ROOT * mem_root)51 FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
52 {
53 FT_WORD *wlist,*p;
54 FT_DOCSTAT docstat;
55 DBUG_ENTER("maria_ft_linearize");
56
57 if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
58 (1+wtree->elements_in_tree))))
59 {
60 docstat.list=wlist;
61 docstat.uniq=wtree->elements_in_tree;
62 docstat.sum=0;
63 tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
64 }
65 delete_tree(wtree, 0);
66 if (!wlist)
67 DBUG_RETURN(NULL);
68
69 docstat.list->pos=NULL;
70
71 for (p=wlist;p->pos;p++)
72 {
73 p->weight=PRENORM_IN_USE;
74 }
75
76 for (p=wlist;p->pos;p++)
77 {
78 p->weight/=NORM_IN_USE;
79 }
80
81 DBUG_RETURN(wlist);
82 }
83
maria_ft_boolean_check_syntax_string(const uchar * str)84 my_bool maria_ft_boolean_check_syntax_string(const uchar *str)
85 {
86 uint i, j;
87
88 if (!str ||
89 (strlen((const char *) str) + 1 != sizeof(ft_boolean_syntax)) ||
90 (str[0] != ' ' && str[1] != ' '))
91 return 1;
92 for (i=0; i<sizeof(ft_boolean_syntax); i++)
93 {
94 /* limiting to 7-bit ascii only */
95 if ((unsigned char)(str[i]) > 127 ||
96 my_isalnum(default_charset_info, str[i]))
97 return 1;
98 for (j=0; j<i; j++)
99 if (str[i] == str[j] && (i != 11 || j != 10))
100 return 1;
101 }
102 return 0;
103 }
104
105 /*
106 RETURN VALUE
107 0 - eof
108 1 - word found
109 2 - left bracket
110 3 - right bracket
111 4 - stopword found
112 */
maria_ft_get_word(CHARSET_INFO * cs,const uchar ** start,const uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * param)113 uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start,
114 const uchar *end,
115 FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
116 {
117 const uchar *doc= *start;
118 int ctype;
119 uint mwc, length;
120 int mbl;
121
122 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
123 param->weight_adjust= param->wasign= 0;
124 param->type= FT_TOKEN_EOF;
125
126 while (doc<end)
127 {
128 for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
129 {
130 mbl= my_ci_ctype(cs, &ctype, doc, end);
131 if (true_word_char(ctype, *doc))
132 break;
133 if (*doc == FTB_RQUOT && param->quot)
134 {
135 param->quot= (char *) doc;
136 *start=doc+1;
137 param->type= FT_TOKEN_RIGHT_PAREN;
138 goto ret;
139 }
140 if (!param->quot)
141 {
142 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
143 {
144 /* param->prev=' '; */
145 *start=doc+1;
146 if (*doc == FTB_LQUOT)
147 param->quot= (char *) *start;
148 param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
149 goto ret;
150 }
151 if (param->prev == ' ')
152 {
153 if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
154 if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
155 if (*doc == FTB_NO ) { param->yesno=-1; continue; } else
156 if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
157 if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
158 if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
159 }
160 }
161 param->prev=*doc;
162 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
163 param->weight_adjust= param->wasign= 0;
164 }
165
166 mwc=length=0;
167 for (word->pos= doc; doc < end; length++,
168 doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
169 {
170 mbl= my_ci_ctype(cs, &ctype, doc, end);
171 if (true_word_char(ctype, *doc))
172 mwc=0;
173 else if (!misc_word_char(*doc) || mwc)
174 break;
175 else
176 mwc++;
177 }
178 param->prev='A'; /* be sure *prev is true_word_char */
179 word->len= (uint)(doc-word->pos) - mwc;
180 if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
181 doc++;
182
183 if (((length >= ft_min_word_len && !is_stopword((char *) word->pos,
184 word->len))
185 || param->trunc) && length < ft_max_word_len)
186 {
187 *start=doc;
188 param->type= FT_TOKEN_WORD;
189 goto ret;
190 }
191 else if (length) /* make sure length > 0 (if start contains spaces only) */
192 {
193 *start= doc;
194 param->type= FT_TOKEN_STOPWORD;
195 goto ret;
196 }
197 }
198 if (param->quot)
199 {
200 param->quot= (char *)(*start= doc);
201 param->type= 3; /* FT_RBR */
202 goto ret;
203 }
204 ret:
205 return param->type;
206 }
207
/*
  Extracts the next plain word from [*start, end).

  Leading non-word characters are skipped, then characters are collected
  until the word ends. On success word->pos / word->len describe the word
  and *start is moved behind it. With skip_stopwords set, words that are
  shorter than ft_min_word_len, not shorter than ft_max_word_len, or that
  are stopwords are silently skipped and scanning goes on.

  RETURN VALUE
    0 - end of input, no word found
    1 - word found
*/
uchar maria_ft_simple_get_word(CHARSET_INFO *cs, uchar **start,
                               const uchar *end, FT_WORD *word,
                               my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint trailing, nchars;
  int ctype, char_len;
  DBUG_ENTER("maria_ft_simple_get_word");

  do
  {
    /* Step 1: advance to the first character that can start a word */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      char_len= my_ci_ctype(cs, &ctype, cur, end);
      if (true_word_char(ctype, *cur))
        break;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    /* Step 2: consume the word, counting its characters */
    word->pos= cur;
    trailing= 0;
    nchars= 0;
    while (cur < end)
    {
      char_len= my_ci_ctype(cs, &ctype, cur, end);
      if (true_word_char(ctype, *cur))
        trailing= 0;
      else if (!misc_word_char(*cur) || trailing)
        break;
      else
        trailing++;
      nchars++;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    /* trailing "misc" characters do not belong to the word */
    word->len= (uint) (cur - word->pos) - trailing;

    /* Step 3: filter by length and stopword list if requested */
    if (skip_stopwords != FALSE &&
        (nchars < ft_min_word_len || nchars >= ft_max_word_len ||
         is_stopword((char *) word->pos, word->len)))
      continue;

    *start= cur;
    DBUG_RETURN(1);
  } while (cur < end);
  DBUG_RETURN(0);
}
253
/*
  Prepares the word tree for parsing. A tree that is already initialized
  is left alone; otherwise it is set up to hold FT_WORD elements compared
  by FT_WORD_cmp, with the charset stored as the tree's custom argument.
*/
void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
{
  DBUG_ENTER("maria_ft_parse_init");
  if (is_tree_inited(wtree))
    DBUG_VOID_RETURN;

  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2) &FT_WORD_cmp, NULL,
            (void*) cs, MYF(0));
  DBUG_VOID_RETURN;
}
262
263
/*
  mysql_add_word callback: inserts one word into the word tree.

  When the parser asks for a copy (MYSQL_FTFLAGS_NEED_COPY) the word bytes
  are duplicated on the parser mem_root first, otherwise the tree entry
  points straight into the caller's buffer. boolean_info is not used.

  RETURN VALUE
    0 - word was added
    1 - out of memory (copy or tree insert failed); the tree is deleted
*/
static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param,
                             const char *word, int word_len,
                             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info
                             __attribute__((unused)))
{
  TREE *wtree;
  FT_WORD w;
  MY_FT_PARSER_PARAM *ft_param= param->mysql_ftparam;
  DBUG_ENTER("maria_ft_add_word");
  wtree= ft_param->wtree;
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
  {
    uchar *ptr;
    DBUG_ASSERT(wtree->with_delete == 0);
    ptr= (uchar *) alloc_root(ft_param->mem_root, word_len);
    if (!ptr)
    {
      /* fail the same way as a failed tree_insert() below */
      delete_tree(wtree, 0);
      DBUG_RETURN(1);
    }
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= (uchar*) word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree, 0);
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
292
293
/*
  mysql_parse callback of the built-in parser: splits the document into
  words (stopwords skipped) and passes each one to param->mysql_add_word.

  RETURN VALUE
    0 - whole document processed
    1 - mysql_add_word reported an error
*/
static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                                   const char *doc_arg,
                                   int doc_len)
{
  MY_FT_PARSER_PARAM *ft_param= param->mysql_ftparam;
  TREE *wtree= ft_param->wtree;
  uchar *cursor= (uchar*) doc_arg;
  uchar *limit= cursor + doc_len;
  FT_WORD w;
  DBUG_ENTER("maria_ft_parse_internal");

  for (;;)
  {
    if (!maria_ft_simple_get_word(wtree->custom_arg, &cursor, limit, &w, TRUE))
      break;
    if (param->mysql_add_word(param, (char*) w.pos, w.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
310
311
/*
  Runs the given full-text parser over a document, collecting the words
  in wtree.

  The parser parameter block is filled with the built-in callbacks, the
  tree's charset, the document and simple mode before parser->parse() is
  invoked. The callbacks reach wtree and mem_root through a stack-local
  MY_FT_PARSER_PARAM, which is therefore only valid during this call.

  RETURN VALUE
    whatever parser->parse() returns
*/
int maria_ft_parse(TREE *wtree, uchar *doc, int doclen,
                   struct st_mysql_ftparser *parser,
                   MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
{
  MY_FT_PARSER_PARAM my_param;
  DBUG_ENTER("maria_ft_parse");
  DBUG_ASSERT(parser);

  /* private context for the callbacks */
  my_param.wtree= wtree;
  my_param.mem_root= mem_root;
  param->mysql_ftparam= &my_param;

  /* callbacks */
  param->mysql_parse= maria_ft_parse_internal;
  param->mysql_add_word= maria_ft_add_word;

  /* document description */
  param->cs= wtree->custom_arg;
  param->doc= (char*) doc;
  param->length= doclen;
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;

  DBUG_RETURN(parser->parse(param));
}
331
332
333 #define MAX_PARAM_NR 2
334
maria_ftparser_alloc_param(MARIA_HA * info)335 MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info)
336 {
337 if (!info->ftparser_param)
338 {
339 /*
340 . info->ftparser_param can not be zero after the initialization,
341 because it always includes built-in fulltext parser. And built-in
342 parser can be called even if the table has no fulltext indexes and
343 no varchar/text fields.
344
345 ftb_find_relevance... parser (ftb_find_relevance_parse,
346 ftb_find_relevance_add_word) calls ftb_check_phrase... parser
347 (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
348 */
349 info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
350 my_malloc(PSI_INSTRUMENT_ME, MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
351 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
352 init_alloc_root(PSI_INSTRUMENT_ME, &info->ft_memroot,
353 FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0));
354 }
355 return info->ftparser_param;
356 }
357
358
maria_ftparser_call_initializer(MARIA_HA * info,uint keynr,uint paramnr)359 MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
360 uint keynr, uint paramnr)
361 {
362 uint32 ftparser_nr;
363 struct st_mysql_ftparser *parser;
364
365 if (!maria_ftparser_alloc_param(info))
366 return 0;
367
368 if (keynr == NO_SUCH_KEY)
369 {
370 ftparser_nr= 0;
371 parser= &ft_default_parser;
372 }
373 else
374 {
375 ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
376 parser= info->s->keyinfo[keynr].parser;
377 }
378 DBUG_ASSERT(paramnr < MAX_PARAM_NR);
379 ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
380 if (! info->ftparser_param[ftparser_nr].mysql_add_word)
381 {
382 /* Note, that mysql_add_word is used here as a flag:
383 mysql_add_word == 0 - parser is not initialized
384 mysql_add_word != 0 - parser is initialized, or no
385 initialization needed. */
386 info->ftparser_param[ftparser_nr].mysql_add_word=
387 (int (*)(struct st_mysql_ftparser_param *, const char *,
388 int, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
389 if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
390 return 0;
391 }
392 return &info->ftparser_param[ftparser_nr];
393 }
394
395
/*
  Releases the parser mem_root and runs the deinit() hook of every
  fulltext parser slot that was marked as initialized, clearing the
  mark afterwards.

  The mem_root is freed even when no parameter array was ever allocated.
  Slots of a key are visited in order and the first unmarked slot ends
  the scan for that key.
*/
void maria_ftparser_call_deinitializer(MARIA_HA *info)
{
  uint keynr, slotnr, keys= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (!info->ftparser_param)
    return;

  for (keynr= 0; keynr < keys; keynr++)
  {
    MARIA_KEYDEF *keyinfo= &info->s->keyinfo[keynr];

    /* only fulltext keys own parser slots */
    if (!(keyinfo->flag & HA_FULLTEXT))
      continue;

    for (slotnr= 0; slotnr < MAX_PARAM_NR; slotnr++)
    {
      MYSQL_FTPARSER_PARAM *slot=
        &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + slotnr];

      if (!slot->mysql_add_word)
        break;
      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(slot);
      slot->mysql_add_word= 0;
    }
  }
}
420