1 /* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights
2 reserved.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
25
26 #include "ftdefs.h"
27 #include "ctype.h"
28
29 typedef struct st_ft_docstat {
30 FT_WORD *list;
31 uint uniq;
32 double sum;
33 } FT_DOCSTAT;
34
35 typedef struct st_my_ft_parser_param
36 {
37 TREE *wtree;
38 MEM_ROOT *mem_root;
39 } MY_FT_PARSER_PARAM;
40
/*
  Comparator installed on the word tree by ft_parse_init(): compares the
  text of two FT_WORD entries with ha_compare_text() in charset `cs` and
  returns its result unchanged.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *lhs= (uchar*) w1->pos;
  uchar *rhs= (uchar*) w2->pos;

  return ha_compare_text(cs, lhs, w1->len, rhs, w2->len, 0, 0);
}
46
/*
  tree_walk() action used by ft_linearize(): assigns the word its weight,
  adds that weight to docstat->sum and appends a copy of the word to the
  array that docstat->list points into. Always returns 0.

  NOTE(review): LWS_IN_USE is a macro defined outside this file and
  presumably derives the weight from `count` - keep that parameter name.
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  FT_WORD *slot= docstat->list;

  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  memcpy(slot, word, sizeof(FT_WORD));
  docstat->list= slot + 1;            /* next call writes the following slot */
  return 0;
}
54
55 /* transforms tree of words into the array, applying normalization */
56
ft_linearize(TREE * wtree,MEM_ROOT * mem_root)57 FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
58 {
59 FT_WORD *wlist,*p;
60 FT_DOCSTAT docstat;
61 DBUG_ENTER("ft_linearize");
62
63 if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
64 (1+wtree->elements_in_tree))))
65 {
66 docstat.list=wlist;
67 docstat.uniq=wtree->elements_in_tree;
68 docstat.sum=0;
69 tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
70 }
71 delete_tree(wtree);
72 if (!wlist)
73 DBUG_RETURN(NULL);
74
75 docstat.list->pos=NULL;
76
77 for (p=wlist;p->pos;p++)
78 {
79 p->weight=PRENORM_IN_USE;
80 }
81
82 for (p=wlist;p->pos;p++)
83 {
84 p->weight/=NORM_IN_USE;
85 }
86
87 DBUG_RETURN(wlist);
88 }
89
ft_boolean_check_syntax_string(const uchar * str)90 my_bool ft_boolean_check_syntax_string(const uchar *str)
91 {
92 uint i, j;
93
94 if (!str ||
95 (strlen((char*) str)+1 != sizeof(DEFAULT_FTB_SYNTAX)) ||
96 (str[0] != ' ' && str[1] != ' '))
97 return 1;
98 for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++)
99 {
100 /* limiting to 7-bit ascii only */
101 if ((unsigned char)(str[i]) > 127 || isalnum(str[i]))
102 return 1;
103 for (j=0; j<i; j++)
104 if (str[i] == str[j] && (i != 11 || j != 10))
105 return 1;
106 }
107 return 0;
108 }
109
110 /*
111 RETURN VALUE
112 0 - eof
113 1 - word found
114 2 - left bracket
115 3 - right bracket
116 4 - stopword found
117 */
ft_get_word(const CHARSET_INFO * cs,uchar ** start,uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * param)118 uchar ft_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end,
119 FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
120 {
121 uchar *doc=*start;
122 int ctype;
123 uint mwc, length;
124 int mbl;
125
126 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
127 param->weight_adjust= param->wasign= 0;
128 param->type= FT_TOKEN_EOF;
129
130 while (doc<end)
131 {
132 for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
133 {
134 mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
135 if (true_word_char(ctype, *doc))
136 break;
137 if (*doc == FTB_RQUOT && param->quot)
138 {
139 *start=doc+1;
140 param->type= FT_TOKEN_RIGHT_PAREN;
141 goto ret;
142 }
143 if (!param->quot)
144 {
145 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
146 {
147 /* param->prev=' '; */
148 *start=doc+1;
149 if (*doc == FTB_LQUOT)
150 param->quot= (char*) 1;
151 param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
152 goto ret;
153 }
154 if (param->prev == ' ')
155 {
156 if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
157 if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
158 if (*doc == FTB_NO ) { param->yesno=-1; continue; } else
159 if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
160 if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
161 if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
162 }
163 }
164 param->prev=*doc;
165 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
166 param->weight_adjust= param->wasign= 0;
167 }
168
169 mwc=length=0;
170 for (word->pos= doc; doc < end; length++,
171 doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
172 {
173 mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
174 if (true_word_char(ctype, *doc))
175 mwc=0;
176 else if (!misc_word_char(*doc) || mwc)
177 break;
178 else
179 mwc++;
180 }
181 param->prev='A'; /* be sure *prev is true_word_char */
182 word->len= (uint)(doc-word->pos) - mwc;
183 if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
184 doc++;
185
186 if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
187 word->len))
188 || param->trunc) && length < ft_max_word_len)
189 {
190 *start=doc;
191 param->type= FT_TOKEN_WORD;
192 goto ret;
193 }
194 else if (length) /* make sure length > 0 (if start contains spaces only) */
195 {
196 *start= doc;
197 param->type= FT_TOKEN_STOPWORD;
198 goto ret;
199 }
200 }
201 if (param->quot)
202 {
203 *start= doc;
204 param->type= 3; /* FT_RBR */
205 goto ret;
206 }
207 ret:
208 return param->type;
209 }
210
/*
  Extracts the next word from plain text (no boolean operators).

  cs              character set; its ctype() handler classifies characters
  start           in: position to resume from; out: position just past the
                  returned word (updated only when 1 is returned)
  end             one past the last byte of the text
  word            out: pos/len of the word found
  skip_stopwords  when FALSE every word is returned; otherwise words that
                  fail the ft_min_word_len / ft_max_word_len limits or are
                  stopwords are silently skipped

  RETURN VALUE
    1 - a word was stored in *word
    0 - end of text reached
*/
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *doc= *start;
  uint trailing_misc, nchars;
  int nbytes;
  int chr_class;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* skip everything up to the first word character */
    for (;;)
    {
      if (doc >= end)
        DBUG_RETURN(0);
      nbytes= cs->cset->ctype(cs, &chr_class, (uchar*)doc, (uchar*)end);
      if (true_word_char(chr_class, *doc))
        break;
      /* step by the reported width; a zero result still moves one byte */
      if (nbytes > 0)
        doc+= nbytes;
      else if (nbytes < 0)
        doc-= nbytes;
      else
        doc++;
    }

    /* collect the word itself */
    word->pos= doc;
    trailing_misc= 0;
    nchars= 0;
    while (doc < end)
    {
      nbytes= cs->cset->ctype(cs, &chr_class, (uchar*)doc, (uchar*)end);
      if (true_word_char(chr_class, *doc))
        trailing_misc= 0;
      else if (misc_word_char(*doc) && !trailing_misc)
        trailing_misc++;
      else
        break;

      nchars++;
      if (nbytes > 0)
        doc+= nbytes;
      else if (nbytes < 0)
        doc-= nbytes;
      else
        doc++;
    }

    /* a trailing misc_word_char is not part of the word */
    word->len= (uint)(doc-word->pos) - trailing_misc;

    if (skip_stopwords == FALSE ||
        (nchars >= ft_min_word_len && nchars < ft_max_word_len &&
         !is_stopword((char*) word->pos, word->len)))
    {
      *start= doc;
      DBUG_RETURN(1);
    }
  } while (doc < end);

  DBUG_RETURN(0);
}
257
/*
  Prepares the word tree for parsing. A tree that is already initialized is
  left alone; otherwise it is set up to hold FT_WORD elements ordered by
  FT_WORD_cmp(), with the character set stored as the tree's custom
  argument.
*/
void ft_parse_init(TREE *wtree, const CHARSET_INFO *cs)
{
  DBUG_ENTER("ft_parse_init");

  if (is_tree_inited(wtree))
    DBUG_VOID_RETURN;

  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2) &FT_WORD_cmp, 0, NULL,
            cs);
  DBUG_VOID_RETURN;
}
265
266
/*
  mysql_add_word callback of the built-in parser: inserts one word into the
  word tree found in param->mysql_ftparam.

  param         parser parameter block; mysql_ftparam must point to a
                MY_FT_PARSER_PARAM
  word          start of the word (not necessarily NUL-terminated)
  word_len      length of the word in bytes
  boolean_info  unused

  When MYSQL_FTFLAGS_NEED_COPY is set the word is first copied into the
  mem_root so that the tree does not reference the caller's buffer.

  RETURN VALUE
    0 - word inserted
    1 - out of memory (copy or tree insertion failed); the tree has been
        deleted
*/
static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
                       char *word, int word_len,
             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
{
  TREE *wtree;
  FT_WORD w;
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  DBUG_ENTER("ft_add_word");
  wtree= ft_param->wtree;
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
  {
    uchar *ptr;
    assert(wtree->with_delete == 0);
    ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
    if (!ptr)
    {
      /*
        alloc_root() can return NULL (ft_linearize() checks for it too).
        Fail the same way as a failed tree_insert() below instead of
        passing a NULL destination to memcpy().
      */
      delete_tree(wtree);
      DBUG_RETURN(1);
    }
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= (uchar*) word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree);
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
294
295
/*
  mysql_parse callback of the built-in parser: splits the document into
  words with ft_simple_get_word() (stopword filtering enabled) and hands
  each of them to param->mysql_add_word().

  RETURN VALUE
    0 - whole document processed
    1 - mysql_add_word() reported a failure
*/
static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                             char *doc_arg, int doc_len)
{
  MY_FT_PARSER_PARAM *ctx= param->mysql_ftparam;
  TREE *words= ctx->wtree;
  uchar *cursor= (uchar*) doc_arg;
  uchar *limit= cursor + doc_len;
  FT_WORD token;
  DBUG_ENTER("ft_parse_internal");

  for (;;)
  {
    /* the tree's custom argument is the character set of the text */
    if (!ft_simple_get_word(words->custom_arg, &cursor, limit, &token, TRUE))
      break;
    if (param->mysql_add_word(param, (char*) token.pos, token.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
311
312
/*
  Runs `parser` over a document, collecting its words into `wtree`.

  wtree     word tree to fill; its custom argument supplies the charset
  doc       document text
  doclen    length of the document in bytes
  parser    parser to invoke (must not be NULL)
  param     parameter block that is filled in here and passed to the parser
  mem_root  arena for words that have to be copied

  param->mysql_ftparam is pointed at a local variable of this function, so
  it is only meaningful for the duration of this call.

  RETURN VALUE
    whatever parser->parse() returns
*/
int ft_parse(TREE *wtree, uchar *doc, int doclen,
             struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
{
  MY_FT_PARSER_PARAM my_param;
  DBUG_ENTER("ft_parse");
  assert(parser);

  /* private context for ft_parse_internal() / ft_add_word() */
  my_param.mem_root= mem_root;
  my_param.wtree= wtree;

  /* the document to parse */
  param->doc= (char*) doc;
  param->length= doclen;
  param->cs= wtree->custom_arg;
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;

  /* built-in callbacks */
  param->mysql_ftparam= &my_param;
  param->mysql_parse= ft_parse_internal;
  param->mysql_add_word= ft_add_word;

  DBUG_RETURN(parser->parse(param));
}
333
334
335 #define MAX_PARAM_NR 2
336
ftparser_alloc_param(MI_INFO * info)337 MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
338 {
339 if (!info->ftparser_param)
340 {
341 /*
342 . info->ftparser_param can not be zero after the initialization,
343 because it always includes built-in fulltext parser. And built-in
344 parser can be called even if the table has no fulltext indexes and
345 no varchar/text fields.
346
347 ftb_find_relevance... parser (ftb_find_relevance_parse,
348 ftb_find_relevance_add_word) calls ftb_check_phrase... parser
349 (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
350 */
351 info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
352 my_malloc(mi_key_memory_FTPARSER_PARAM,
353 MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
354 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
355 init_alloc_root(mi_key_memory_ft_memroot,
356 &info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
357 }
358 return info->ftparser_param;
359 }
360
361
ftparser_call_initializer(MI_INFO * info,uint keynr,uint paramnr)362 MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
363 uint keynr, uint paramnr)
364 {
365 uint32 ftparser_nr;
366 struct st_mysql_ftparser *parser;
367
368 if (!ftparser_alloc_param(info))
369 return 0;
370
371 if (keynr == NO_SUCH_KEY)
372 {
373 ftparser_nr= 0;
374 parser= &ft_default_parser;
375 }
376 else
377 {
378 ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
379 parser= info->s->keyinfo[keynr].parser;
380 }
381 assert(paramnr < MAX_PARAM_NR);
382 ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
383 if (! info->ftparser_param[ftparser_nr].mysql_add_word)
384 {
385 /* Note, that mysql_add_word is used here as a flag:
386 mysql_add_word == 0 - parser is not initialized
387 mysql_add_word != 0 - parser is initialized, or no
388 initialization needed. */
389 info->ftparser_param[ftparser_nr].mysql_add_word=
390 (int (*)(struct st_mysql_ftparser_param *, char *, int,
391 MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
392 if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
393 return 0;
394 }
395 return &info->ftparser_param[ftparser_nr];
396 }
397
/*
  Releases per-handle parser state: frees info->ft_memroot and, for every
  fulltext key, runs the parser's deinit hook on each slot that was marked
  initialized, then clears the mark. The parameter array itself is not
  freed here.
*/
void ftparser_call_deinitializer(MI_INFO *info)
{
  uint key_idx, slot_idx;
  uint key_count= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (! info->ftparser_param)
    return;

  for (key_idx= 0; key_idx < key_count; key_idx++)
  {
    MI_KEYDEF *keyinfo= &info->s->keyinfo[key_idx];

    if (!(keyinfo->flag & HA_FULLTEXT))
      continue;                         /* only fulltext keys own slots */

    for (slot_idx= 0; slot_idx < MAX_PARAM_NR; slot_idx++)
    {
      MYSQL_FTPARSER_PARAM *slot=
        &info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR + slot_idx];

      /* stop at the first slot of this key that was never initialized */
      if (!slot->mysql_add_word)
        break;

      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(slot);
      slot->mysql_add_word= 0;
    }
  }
}
422
423