1 /* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
6
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
15
16 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
17
18 #include "ftdefs.h"
19
20 typedef struct st_ft_docstat {
21 FT_WORD *list;
22 uint uniq;
23 double sum;
24 } FT_DOCSTAT;
25
26 typedef struct st_my_ft_parser_param
27 {
28 TREE *wtree;
29 MEM_ROOT *mem_root;
30 } MY_FT_PARSER_PARAM;
31
/*
  Comparator for the word tree: compares the text of two FT_WORD
  entries with ha_compare_text() under the given character set.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *lhs= (uchar*) w1->pos;
  uchar *rhs= (uchar*) w2->pos;

  return ha_compare_text(cs, lhs, w1->len, rhs, w2->len, 0);
}
37
/*
  tree_walk() action: assigns the word its local weight, adds that weight
  to the running sum and appends a copy of the word to the output array.

  NOTE: LWS_IN_USE is a macro defined elsewhere; it may expand to the
  parameter names, so they must stay as they are.

  Always returns 0.
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  FT_WORD *slot= docstat->list;

  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  memcpy(slot, word, sizeof(FT_WORD));
  docstat->list= slot + 1;
  return 0;
}
45
46 /* transforms tree of words into the array, applying normalization */
47
ft_linearize(TREE * wtree,MEM_ROOT * mem_root)48 FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
49 {
50 FT_WORD *wlist,*p;
51 FT_DOCSTAT docstat;
52 DBUG_ENTER("ft_linearize");
53
54 if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
55 (1+wtree->elements_in_tree))))
56 {
57 docstat.list=wlist;
58 docstat.uniq=wtree->elements_in_tree;
59 docstat.sum=0;
60 tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
61 }
62 delete_tree(wtree, 0);
63 if (!wlist)
64 DBUG_RETURN(NULL);
65
66 docstat.list->pos=NULL;
67
68 for (p=wlist;p->pos;p++)
69 {
70 p->weight=PRENORM_IN_USE;
71 }
72
73 for (p=wlist;p->pos;p++)
74 {
75 p->weight/=NORM_IN_USE;
76 }
77
78 DBUG_RETURN(wlist);
79 }
80
ft_boolean_check_syntax_string(const uchar * str,size_t length,CHARSET_INFO * cs)81 my_bool ft_boolean_check_syntax_string(const uchar *str, size_t length,
82 CHARSET_INFO *cs)
83 {
84 uint i, j;
85
86 if (cs->mbminlen != 1)
87 {
88 DBUG_ASSERT(0);
89 return 1;
90 }
91
92 if (!str ||
93 (length + 1 != sizeof(DEFAULT_FTB_SYNTAX)) ||
94 (str[0] != ' ' && str[1] != ' '))
95 return 1;
96 for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++)
97 {
98 /* limiting to 7-bit ascii only */
99 if ((unsigned char)(str[i]) > 127 || my_isalnum(cs, str[i]))
100 return 1;
101 for (j=0; j<i; j++)
102 if (str[i] == str[j] && (i != 11 || j != 10))
103 return 1;
104 }
105 return 0;
106 }
107
108 /*
109 RETURN VALUE
110 0 - eof
111 1 - word found
112 2 - left bracket
113 3 - right bracket
114 4 - stopword found
115 */
ft_get_word(CHARSET_INFO * cs,const uchar ** start,const uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * param)116 uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end,
117 FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
118 {
119 const uchar *doc=*start;
120 int ctype;
121 uint mwc, length;
122 int mbl;
123
124 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
125 param->weight_adjust= param->wasign= 0;
126 param->type= FT_TOKEN_EOF;
127
128 while (doc<end)
129 {
130 for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
131 {
132 mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
133 if (true_word_char(ctype, *doc))
134 break;
135 if (*doc == FTB_RQUOT && param->quot)
136 {
137 *start=doc+1;
138 param->type= FT_TOKEN_RIGHT_PAREN;
139 goto ret;
140 }
141 if (!param->quot)
142 {
143 if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
144 {
145 /* param->prev=' '; */
146 *start=doc+1;
147 if (*doc == FTB_LQUOT)
148 param->quot= (char*) 1;
149 param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
150 goto ret;
151 }
152 if (param->prev == ' ')
153 {
154 if (*doc == FTB_YES ) { param->yesno=+1; continue; } else
155 if (*doc == FTB_EGAL) { param->yesno= 0; continue; } else
156 if (*doc == FTB_NO ) { param->yesno=-1; continue; } else
157 if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
158 if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
159 if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
160 }
161 }
162 param->prev=*doc;
163 param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
164 param->weight_adjust= param->wasign= 0;
165 }
166
167 mwc=length=0;
168 for (word->pos= doc; doc < end; length++,
169 doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
170 {
171 mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
172 if (true_word_char(ctype, *doc))
173 mwc=0;
174 else if (!misc_word_char(*doc) || mwc)
175 break;
176 else
177 mwc++;
178 }
179 param->prev='A'; /* be sure *prev is true_word_char */
180 word->len= (uint)(doc-word->pos) - mwc;
181 if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
182 doc++;
183
184 if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
185 word->len))
186 || param->trunc) && length < ft_max_word_len)
187 {
188 *start=doc;
189 param->type= FT_TOKEN_WORD;
190 goto ret;
191 }
192 else if (length) /* make sure length > 0 (if start contains spaces only) */
193 {
194 *start= doc;
195 param->type= FT_TOKEN_STOPWORD;
196 goto ret;
197 }
198 }
199 if (param->quot)
200 {
201 *start= doc;
202 param->type= 3; /* FT_RBR */
203 goto ret;
204 }
205 ret:
206 return param->type;
207 }
208
/*
  Extracts the next word from plain (non-boolean) text.

  SYNOPSIS
    cs              character set used to classify characters
    start           in: where to start scanning; out: first byte after
                    the returned word (updated only when a word is
                    returned)
    end             end of the input
    word            out: pos/len of the word
    skip_stopwords  if not FALSE, words outside the
                    [ft_min_word_len, ft_max_word_len) length range and
                    stopwords are skipped

  RETURN VALUE
    1 - a word was stored in *word
    0 - no more words
*/
uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint trailing, nchars;
  int char_len;
  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Advance to the first character that can be part of a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      char_len= cs->cset->ctype(cs, &ctype, (uchar*)cur, (uchar*)end);
      if (true_word_char(ctype, *cur))
        break;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    /*
      Consume the word. nchars counts characters; trailing counts the
      misc_word_char characters at the end that do not belong to it.
    */
    trailing= nchars= 0;
    word->pos= cur;
    while (cur < end)
    {
      char_len= cs->cset->ctype(cs, &ctype, (uchar*)cur, (uchar*)end);
      if (true_word_char(ctype, *cur))
        trailing= 0;
      else if (!misc_word_char(*cur) || trailing)
        break;
      else
        trailing++;

      nchars++;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    word->len= (uint)(cur-word->pos) - trailing;

    if (skip_stopwords == FALSE ||
        (nchars >= ft_min_word_len && nchars < ft_max_word_len &&
         !is_stopword((char*) word->pos, word->len)))
    {
      *start= cur;
      DBUG_RETURN(1);
    }
  } while (cur < end);

  DBUG_RETURN(0);
}
254
/*
  Prepares the word tree for parsing. Does nothing if the tree is
  already initialized; otherwise sets it up to hold FT_WORD elements
  ordered by FT_WORD_cmp, with the character set as custom argument.
*/
void ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
{
  DBUG_ENTER("ft_parse_init");
  if (is_tree_inited(wtree))
  {
    DBUG_VOID_RETURN;
  }
  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2) &FT_WORD_cmp, 0,
            (void*) cs, MYF(0));
  DBUG_VOID_RETURN;
}
263
264
/*
  mysql_add_word callback of the built-in parser: inserts one word into
  the word tree.

  SYNOPSIS
    param         parser parameters; mysql_ftparam points to the
                  MY_FT_PARSER_PARAM holding the tree and the mem_root
    word          start of the word
    word_len      length of the word in bytes
    boolean_info  unused

  If MYSQL_FTFLAGS_NEED_COPY is set the word is first copied to the
  mem_root, otherwise the tree entry points into the caller's buffer.

  RETURN VALUE
    0 - ok
    1 - out of memory (copy or tree insert failed); the tree has been
        deleted
*/
static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
                       const char *word, int word_len,
             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
{
  TREE *wtree;
  FT_WORD w;
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  DBUG_ENTER("ft_add_word");
  wtree= ft_param->wtree;
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
  {
    uchar *ptr;
    DBUG_ASSERT(wtree->with_delete == 0);
    ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
    if (!ptr)
    {
      /* fail the same way as a failed tree_insert() below */
      delete_tree(wtree, 0);
      DBUG_RETURN(1);
    }
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= (uchar*) word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree, 0);
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
292
293
/*
  mysql_parse callback of the built-in parser: splits the document into
  words (skipping stopwords) and passes each one to
  param->mysql_add_word.

  RETURN VALUE
    0 - whole document processed
    1 - mysql_add_word reported an error
*/
static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                             const char *doc_arg, int doc_len)
{
  MY_FT_PARSER_PARAM *ft_param= param->mysql_ftparam;
  TREE *wtree= ft_param->wtree;
  uchar *cursor= (uchar*) doc_arg;
  uchar *limit= cursor + doc_len;
  FT_WORD found;
  DBUG_ENTER("ft_parse_internal");

  for (;;)
  {
    if (!ft_simple_get_word(wtree->custom_arg, &cursor, limit, &found, TRUE))
      break;
    if (param->mysql_add_word(param, (char*) found.pos, (int) found.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
309
310
/*
  Runs the given full-text parser over a document, collecting the words
  in wtree.

  SYNOPSIS
    wtree     word tree; its custom_arg supplies the character set
    doc       document text
    doclen    length of the document in bytes
    parser    parser to invoke, must not be NULL
    param     parser parameter block, filled in here before the call
    mem_root  arena made available to the add-word callback

  The private parameter block lives on this function's stack, so
  param->mysql_ftparam is only valid for the duration of the call.

  RETURN VALUE
    whatever parser->parse() returns
*/
int ft_parse(TREE *wtree, uchar *doc, int doclen,
             struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
{
  MY_FT_PARSER_PARAM my_param;
  int rc;
  DBUG_ENTER("ft_parse");
  DBUG_ASSERT(parser);

  my_param.wtree= wtree;
  my_param.mem_root= mem_root;

  /* callbacks and private state */
  param->mysql_parse= ft_parse_internal;
  param->mysql_add_word= ft_add_word;
  param->mysql_ftparam= &my_param;

  /* description of the document */
  param->cs= wtree->custom_arg;
  param->doc= (char*) doc;
  param->length= doclen;
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;

  rc= parser->parse(param);
  DBUG_RETURN(rc);
}
331
332
333 #define MAX_PARAM_NR 2
334
ftparser_alloc_param(MI_INFO * info)335 MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
336 {
337 if (!info->ftparser_param)
338 {
339 /*
340 . info->ftparser_param can not be zero after the initialization,
341 because it always includes built-in fulltext parser. And built-in
342 parser can be called even if the table has no fulltext indexes and
343 no varchar/text fields.
344
345 ftb_find_relevance... parser (ftb_find_relevance_parse,
346 ftb_find_relevance_add_word) calls ftb_check_phrase... parser
347 (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
348 */
349 info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
350 my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
351 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
352 init_alloc_root(&info->ft_memroot, "fulltext_parser",
353 FTPARSER_MEMROOT_ALLOC_SIZE, 0, MYF(0));
354 }
355 return info->ftparser_param;
356 }
357
358
ftparser_call_initializer(MI_INFO * info,uint keynr,uint paramnr)359 MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
360 uint keynr, uint paramnr)
361 {
362 uint32 ftparser_nr;
363 struct st_mysql_ftparser *parser;
364
365 if (!ftparser_alloc_param(info))
366 return 0;
367
368 if (keynr == NO_SUCH_KEY)
369 {
370 ftparser_nr= 0;
371 parser= &ft_default_parser;
372 }
373 else
374 {
375 ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
376 parser= info->s->keyinfo[keynr].parser;
377 }
378 DBUG_ASSERT(paramnr < MAX_PARAM_NR);
379 ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
380 if (! info->ftparser_param[ftparser_nr].mysql_add_word)
381 {
382 /* Note, that mysql_add_word is used here as a flag:
383 mysql_add_word == 0 - parser is not initialized
384 mysql_add_word != 0 - parser is initialized, or no
385 initialization needed. */
386 info->ftparser_param[ftparser_nr].mysql_add_word=
387 (int (*)(struct st_mysql_ftparser_param *, const char *, int,
388 MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
389 if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
390 return 0;
391 }
392 return &info->ftparser_param[ftparser_nr];
393 }
394
/*
  Frees info->ft_memroot and, for every fulltext key, calls the parser's
  deinit() on each parameter slot that was marked as initialized, then
  clears the mark. The parameter array itself is not freed here.
*/
void ftparser_call_deinitializer(MI_INFO *info)
{
  uint i, j, keys= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (! info->ftparser_param)
    return;

  for (i= 0; i < keys; i++)
  {
    MI_KEYDEF *keyinfo= &info->s->keyinfo[i];

    for (j= 0; j < MAX_PARAM_NR; j++)
    {
      MYSQL_FTPARSER_PARAM *slot=
        &info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR + j];

      /* stop at a non-fulltext key or at the first unused slot */
      if (!(keyinfo->flag & HA_FULLTEXT) || !slot->mysql_add_word)
        break;

      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(slot);
      slot->mysql_add_word= 0;
    }
  }
}
419
420