1 /* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights
2 reserved.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23
24 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
25
26 #include "ftdefs.h"
27 #include "ctype.h"
28
/*
  Accumulator handed to walk_and_copy() while ft_linearize() flattens the
  word tree into an array.
*/
typedef struct st_ft_docstat {
  FT_WORD *list;   /* next free slot of the output array; advanced per word */
  uint uniq;       /* number of elements in the word tree */
  double sum;      /* running sum of the weights assigned to copied words */
} FT_DOCSTAT;
34
/*
  Private state that ft_parse() attaches to MYSQL_FTPARSER_PARAM
  (as mysql_ftparam) for use by ft_add_word() and ft_parse_internal().
*/
typedef struct st_my_ft_parser_param
{
  TREE *wtree;          /* tree into which parsed words are inserted */
  MEM_ROOT *mem_root;   /* allocator used when a word has to be copied */
} MY_FT_PARSER_PARAM;
40
/*
  Tree comparator for FT_WORD entries: compares the text of two words with
  ha_compare_text() under charset cs and returns its result unchanged.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *lhs= (uchar*) w1->pos;
  uchar *rhs= (uchar*) w2->pos;

  return ha_compare_text(cs, lhs, w1->len, rhs, w2->len, 0, 0);
}
46
/*
  tree_walk() callback used by ft_linearize().

  Assigns the word its local weight, adds that weight to docstat->sum and
  appends a copy of the word to the array docstat->list points into.

  NOTE(review): LWS_IN_USE is a macro defined outside this file; it
  presumably derives the weight from the `count` argument - confirm in
  ftdefs.h.

  RETURN VALUE
    always 0
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  FT_WORD *slot= docstat->list;

  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  /* the weight is set before the copy so the array entry carries it too */
  memcpy(slot, word, sizeof(FT_WORD));
  docstat->list= slot + 1;
  return 0;
}
54
55 /* transforms tree of words into the array, applying normalization */
56
ft_linearize(TREE * wtree,MEM_ROOT * mem_root)57 FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
58 {
59 FT_WORD *wlist,*p;
60 FT_DOCSTAT docstat;
61 DBUG_ENTER("ft_linearize");
62
63 if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
64 (1+wtree->elements_in_tree))))
65 {
66 docstat.list=wlist;
67 docstat.uniq=wtree->elements_in_tree;
68 docstat.sum=0;
69 tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
70 }
71 delete_tree(wtree);
72 if (!wlist)
73 DBUG_RETURN(NULL);
74
75 docstat.list->pos=NULL;
76
77 for (p=wlist;p->pos;p++)
78 {
79 p->weight=PRENORM_IN_USE;
80 }
81
82 for (p=wlist;p->pos;p++)
83 {
84 p->weight/=NORM_IN_USE;
85 }
86
87 DBUG_RETURN(wlist);
88 }
89
ft_boolean_check_syntax_string(const uchar * str)90 my_bool ft_boolean_check_syntax_string(const uchar *str)
91 {
92 uint i, j;
93
94 if (!str ||
95 (strlen((char*) str)+1 != sizeof(DEFAULT_FTB_SYNTAX)) ||
96 (str[0] != ' ' && str[1] != ' '))
97 return 1;
98 for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++)
99 {
100 /* limiting to 7-bit ascii only */
101 if ((unsigned char)(str[i]) > 127 || isalnum(str[i]))
102 return 1;
103 for (j=0; j<i; j++)
104 if (str[i] == str[j] && (i != 11 || j != 10))
105 return 1;
106 }
107 return 0;
108 }
109
110 /*
111 RETURN VALUE
112 0 - eof
113 1 - word found
114 2 - left bracket
115 3 - right bracket
116 4 - stopword found
117 */
ft_get_word(const CHARSET_INFO * cs,uchar ** start,uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * param)118 uchar ft_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end,
119 FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
120 {
121 uchar *doc=*start;
122 int ctype;
123 uint mwc, length;
124 int mbl;
125
126 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
127 param->weight_adjust= param->wasign= 0;
128 param->type= FT_TOKEN_EOF;
129
130 while (doc<end)
131 {
132 for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
133 {
134 mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
135 if (true_word_char(ctype, *doc))
136 break;
137 if (*doc == FTB_RQUOT && param->quot)
138 {
139 *start=doc+1;
140 param->type= FT_TOKEN_RIGHT_PAREN;
141 goto ret;
142 }
143 if (!param->quot)
144 {
145 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
146 {
147 /* param->prev=' '; */
148 *start=doc+1;
149 if (*doc == FTB_LQUOT)
150 param->quot= (char*) 1;
151 param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
152 goto ret;
153 }
154 if (param->prev == ' ')
155 {
156 if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
157 if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
158 if (*doc == FTB_NO ) { param->yesno=-1; continue; } else
159 if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
160 if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
161 if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
162 }
163 }
164 param->prev=*doc;
165 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
166 param->weight_adjust= param->wasign= 0;
167 }
168
169 mwc=length=0;
170 for (word->pos= doc; doc < end; length++,
171 doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
172 {
173 mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
174 if (true_word_char(ctype, *doc))
175 mwc=0;
176 else if (!misc_word_char(*doc) || mwc)
177 break;
178 else
179 mwc++;
180 }
181 param->prev='A'; /* be sure *prev is true_word_char */
182 word->len= (uint)(doc-word->pos) - mwc;
183 if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
184 doc++;
185
186 if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
187 word->len))
188 || param->trunc) && length < ft_max_word_len)
189 {
190 *start=doc;
191 param->type= FT_TOKEN_WORD;
192 goto ret;
193 }
194 else if (length) /* make sure length > 0 (if start contains spaces only) */
195 {
196 *start= doc;
197 param->type= FT_TOKEN_STOPWORD;
198 goto ret;
199 }
200 }
201 if (param->quot)
202 {
203 *start= doc;
204 param->type= 3; /* FT_RBR */
205 goto ret;
206 }
207 ret:
208 return param->type;
209 }
210
/*
  Extracts the next word from plain text (no boolean operators).

  SYNOPSIS
    cs              charset whose ctype() handler classifies characters
    start           in:  position to start scanning at
                    out: position just after the returned word; not
                         modified when 0 is returned
    end             one past the last byte of the input
    word            out: pos/len of the word found
    skip_stopwords  if FALSE every word is returned; otherwise words that
                    are stopwords, shorter than ft_min_word_len or not
                    shorter than ft_max_word_len are skipped

  RETURN VALUE
    1 - a word was found
    0 - end of input reached
*/
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *doc= *start;
  uint mwc, length;
  int mbl;
  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  for (;;)
  {
    /* skip everything that cannot start a word */
    for (;;)
    {
      if (doc >= end)
        DBUG_RETURN(0);
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        break;
      /* always advance, even if ctype() reports zero or a negative value */
      doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1));
    }

    /*
      Collect the word. length counts characters; mwc counts trailing
      misc_word_char characters, which are not part of the word.
    */
    word->pos= doc;
    mwc= 0;
    length= 0;
    while (doc < end)
    {
      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
      if (true_word_char(ctype, *doc))
        mwc= 0;
      else if ((misc_word_char(*doc)) && !mwc)
        mwc++;
      else
        break;
      length++;
      doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1));
    }

    word->len= (uint)(doc-word->pos) - mwc;

    if (skip_stopwords == FALSE ||
        !(length < ft_min_word_len || length >= ft_max_word_len ||
          is_stopword((char*) word->pos, word->len)))
    {
      *start= doc;
      DBUG_RETURN(1);
    }

    /* the word was rejected: try the next one unless the input is spent */
    if (doc >= end)
      break;
  }
  DBUG_RETURN(0);
}
257
/*
  Prepares wtree for collecting FT_WORD elements.

  A tree that is already initialized is left alone. Otherwise it is set up
  with FT_WORD_cmp as the comparator and cs as the custom argument.
*/
void ft_parse_init(TREE *wtree, const CHARSET_INFO *cs)
{
  DBUG_ENTER("ft_parse_init");
  if (is_tree_inited(wtree))
  {
    DBUG_VOID_RETURN;
  }
  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2) &FT_WORD_cmp,
            0, NULL, cs);
  DBUG_VOID_RETURN;
}
265
266
/*
  mysql_add_word callback installed by ft_parse(): inserts one word into
  the word tree.

  SYNOPSIS
    param         parser parameter; param->mysql_ftparam must point to the
                  MY_FT_PARSER_PARAM holding the tree and the mem_root
    word          start of the word
    word_len      length of the word in bytes
    boolean_info  unused

  If MYSQL_FTFLAGS_NEED_COPY is set in param->flags the word is first
  copied into memory taken from the mem_root, and the tree entry refers
  to that copy; otherwise the entry refers to the caller's buffer.

  RETURN VALUE
    0 - word inserted
    1 - out of memory (copy or tree insertion failed); the tree has been
        deleted
*/
static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
                       char *word, int word_len,
             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
{
  TREE *wtree;
  FT_WORD w;
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  DBUG_ENTER("ft_add_word");
  wtree= ft_param->wtree;
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
  {
    uchar *ptr;
    DBUG_ASSERT(wtree->with_delete == 0);
    ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
    if (!ptr)
    {
      /* same failure handling as a failed tree_insert() below */
      delete_tree(wtree);
      DBUG_RETURN(1);
    }
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= (uchar*) word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree);
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
294
295
/*
  mysql_parse callback installed by ft_parse(): splits a document into
  words with ft_simple_get_word() (stopword filtering enabled) and hands
  each word to param->mysql_add_word.

  SYNOPSIS
    param    parser parameter; param->mysql_ftparam must point to the
             MY_FT_PARSER_PARAM whose tree supplies the charset
    doc_arg  start of the document
    doc_len  document length in bytes

  RETURN VALUE
    0 - whole document processed
    1 - mysql_add_word reported an error
*/
static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                             char *doc_arg, int doc_len)
{
  MY_FT_PARSER_PARAM *ft_param= param->mysql_ftparam;
  TREE *wtree= ft_param->wtree;
  uchar *cursor= (uchar*) doc_arg;
  uchar *limit= cursor + doc_len;
  FT_WORD w;
  DBUG_ENTER("ft_parse_internal");

  for (;;)
  {
    if (!ft_simple_get_word(wtree->custom_arg, &cursor, limit, &w, TRUE))
      break;
    if (param->mysql_add_word(param, (char*) w.pos, w.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
311
312
/*
  Runs a fulltext parser over a document, collecting its words in wtree.

  SYNOPSIS
    wtree     tree receiving the words; its custom_arg supplies the charset
    doc       document text
    doclen    document length in bytes
    parser    parser whose parse() method is invoked (must not be NULL)
    param     parser parameter, filled in here before the call
    mem_root  allocator for words that have to be copied

  Note that param->mysql_ftparam is pointed at a local variable of this
  function, so it is only meaningful for the duration of the call.

  RETURN VALUE
    whatever parser->parse() returns
*/
int ft_parse(TREE *wtree, uchar *doc, int doclen,
             struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
{
  MY_FT_PARSER_PARAM my_param;
  int rc;
  DBUG_ENTER("ft_parse");
  DBUG_ASSERT(parser);

  my_param.mem_root= mem_root;
  my_param.wtree= wtree;

  /* callbacks and private state */
  param->mysql_ftparam= &my_param;
  param->mysql_add_word= ft_add_word;
  param->mysql_parse= ft_parse_internal;

  /* input description */
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
  param->cs= wtree->custom_arg;
  param->length= doclen;
  param->doc= (char*) doc;

  rc= parser->parse(param);
  DBUG_RETURN(rc);
}
333
334
/*
  Number of parser parameter slots kept per fulltext key. The slot for a
  given key and parameter number is ftkey_nr * MAX_PARAM_NR + paramnr.
*/
#define MAX_PARAM_NR 2
336
ftparser_alloc_param(MI_INFO * info)337 MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
338 {
339 if (!info->ftparser_param)
340 {
341 /*
342 . info->ftparser_param can not be zero after the initialization,
343 because it always includes built-in fulltext parser. And built-in
344 parser can be called even if the table has no fulltext indexes and
345 no varchar/text fields.
346
347 ftb_find_relevance... parser (ftb_find_relevance_parse,
348 ftb_find_relevance_add_word) calls ftb_check_phrase... parser
349 (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
350 */
351 info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
352 my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
353 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
354 init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
355 }
356 return info->ftparser_param;
357 }
358
359
ftparser_call_initializer(MI_INFO * info,uint keynr,uint paramnr)360 MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
361 uint keynr, uint paramnr)
362 {
363 uint32 ftparser_nr;
364 struct st_mysql_ftparser *parser;
365
366 if (!ftparser_alloc_param(info))
367 return 0;
368
369 if (keynr == NO_SUCH_KEY)
370 {
371 ftparser_nr= 0;
372 parser= &ft_default_parser;
373 }
374 else
375 {
376 ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
377 parser= info->s->keyinfo[keynr].parser;
378 }
379 DBUG_ASSERT(paramnr < MAX_PARAM_NR);
380 ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
381 if (! info->ftparser_param[ftparser_nr].mysql_add_word)
382 {
383 /* Note, that mysql_add_word is used here as a flag:
384 mysql_add_word == 0 - parser is not initialized
385 mysql_add_word != 0 - parser is initialized, or no
386 initialization needed. */
387 info->ftparser_param[ftparser_nr].mysql_add_word=
388 (int (*)(struct st_mysql_ftparser_param *, char *, int,
389 MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
390 if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
391 return 0;
392 }
393 return &info->ftparser_param[ftparser_nr];
394 }
395
/*
  Releases the fulltext parser state of a table handle.

  Frees info->ft_memroot, then for every fulltext key walks its parameter
  slots in order: each slot whose mysql_add_word flag is set gets the
  parser's deinit() (if the parser has one) and has the flag cleared.
  The walk over a key's slots stops at the first slot that is not
  flagged. The parameter array itself is not freed here.
*/
void ftparser_call_deinitializer(MI_INFO *info)
{
  uint key_idx, slot_idx;
  const uint key_count= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (info->ftparser_param == NULL)
    return;

  for (key_idx= 0; key_idx < key_count; key_idx++)
  {
    MI_KEYDEF *keyinfo= &info->s->keyinfo[key_idx];
    MYSQL_FTPARSER_PARAM *slots;

    if (!(keyinfo->flag & HA_FULLTEXT))
      continue;

    slots= info->ftparser_param + keyinfo->ftkey_nr * MAX_PARAM_NR;
    for (slot_idx= 0; slot_idx < MAX_PARAM_NR; slot_idx++)
    {
      MYSQL_FTPARSER_PARAM *ftparser_param= slots + slot_idx;

      if (!ftparser_param->mysql_add_word)
        break;
      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(ftparser_param);
      ftparser_param->mysql_add_word= 0;
    }
  }
}
420
421