1 /* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
15 
16 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
17 
18 #include "ftdefs.h"
19 #include "ctype.h"
20 
21 typedef struct st_ft_docstat {
22   FT_WORD *list;
23   uint uniq;
24   double sum;
25 } FT_DOCSTAT;
26 
27 typedef struct st_my_ft_parser_param
28 {
29   TREE     *wtree;
30   MEM_ROOT *mem_root;
31 } MY_FT_PARSER_PARAM;
32 
/*
  Comparison callback for the word tree: compares the text of two FT_WORD
  entries with ha_compare_text() using the supplied charset.

  Returns whatever ha_compare_text() returns.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *lhs= (uchar*) w1->pos;
  uchar *rhs= (uchar*) w2->pos;

  return ha_compare_text(cs, lhs, w1->len, rhs, w2->len, 0, 0);
}
38 
/*
  tree_walk() action used by ft_linearize(): stores the weight of one word,
  adds it to docstat->sum and appends the word to the output array.

  NOTE(review): `count' is not referenced directly in this body; LWS_IN_USE
  is defined outside this file and presumably refers to it by name, so do
  not rename the parameter without checking that macro.

  Always returns 0.
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  /* Copy the word into the current slot, then advance to the next one. */
  memcpy(docstat->list, word, sizeof(*word));
  docstat->list++;
  return 0;
}
46 
47 /* transforms tree of words into the array, applying normalization */
48 
ft_linearize(TREE * wtree,MEM_ROOT * mem_root)49 FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
50 {
51   FT_WORD *wlist,*p;
52   FT_DOCSTAT docstat;
53   DBUG_ENTER("ft_linearize");
54 
55   if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
56                                     (1+wtree->elements_in_tree))))
57   {
58     docstat.list=wlist;
59     docstat.uniq=wtree->elements_in_tree;
60     docstat.sum=0;
61     tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
62   }
63   delete_tree(wtree);
64   if (!wlist)
65     DBUG_RETURN(NULL);
66 
67   docstat.list->pos=NULL;
68 
69   for (p=wlist;p->pos;p++)
70   {
71     p->weight=PRENORM_IN_USE;
72   }
73 
74   for (p=wlist;p->pos;p++)
75   {
76     p->weight/=NORM_IN_USE;
77   }
78 
79   DBUG_RETURN(wlist);
80 }
81 
ft_boolean_check_syntax_string(const uchar * str)82 my_bool ft_boolean_check_syntax_string(const uchar *str)
83 {
84   uint i, j;
85 
86   if (!str ||
87       (strlen((char*) str)+1 != sizeof(DEFAULT_FTB_SYNTAX)) ||
88       (str[0] != ' ' && str[1] != ' '))
89     return 1;
90   for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++)
91   {
92     /* limiting to 7-bit ascii only */
93     if ((unsigned char)(str[i]) > 127 || isalnum(str[i]))
94       return 1;
95     for (j=0; j<i; j++)
96       if (str[i] == str[j] && (i != 11 || j != 10))
97         return 1;
98   }
99   return 0;
100 }
101 
102 /*
103   RETURN VALUE
104   0 - eof
105   1 - word found
106   2 - left bracket
107   3 - right bracket
108   4 - stopword found
109 */
ft_get_word(CHARSET_INFO * cs,uchar ** start,uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * param)110 uchar ft_get_word(CHARSET_INFO *cs, uchar **start, uchar *end,
111                   FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
112 {
113   uchar *doc=*start;
114   int ctype;
115   uint mwc, length;
116   int mbl;
117 
118   param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
119   param->weight_adjust= param->wasign= 0;
120   param->type= FT_TOKEN_EOF;
121 
122   while (doc<end)
123   {
124     for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
125     {
126       mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
127       if (true_word_char(ctype, *doc))
128         break;
129       if (*doc == FTB_RQUOT && param->quot)
130       {
131         *start=doc+1;
132         param->type= FT_TOKEN_RIGHT_PAREN;
133         goto ret;
134       }
135       if (!param->quot)
136       {
137         if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
138         {
139           /* param->prev=' '; */
140           *start=doc+1;
141           if (*doc == FTB_LQUOT)
142             param->quot= (char*) 1;
143           param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
144           goto ret;
145         }
146         if (param->prev == ' ')
147         {
148           if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
149           if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
150           if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
151           if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
152           if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
153           if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
154         }
155       }
156       param->prev=*doc;
157       param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
158       param->weight_adjust= param->wasign= 0;
159     }
160 
161     mwc=length=0;
162     for (word->pos= doc; doc < end; length++,
163          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
164     {
165       mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
166       if (true_word_char(ctype, *doc))
167         mwc=0;
168       else if (!misc_word_char(*doc) || mwc)
169         break;
170       else
171         mwc++;
172     }
173     param->prev='A'; /* be sure *prev is true_word_char */
174     word->len= (uint)(doc-word->pos) - mwc;
175     if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
176       doc++;
177 
178     if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
179                                                     word->len))
180          || param->trunc) && length < ft_max_word_len)
181     {
182       *start=doc;
183       param->type= FT_TOKEN_WORD;
184       goto ret;
185     }
186     else if (length) /* make sure length > 0 (if start contains spaces only) */
187     {
188       *start= doc;
189       param->type= FT_TOKEN_STOPWORD;
190       goto ret;
191     }
192   }
193   if (param->quot)
194   {
195     *start= doc;
196     param->type= 3; /* FT_RBR */
197     goto ret;
198   }
199 ret:
200   return param->type;
201 }
202 
/*
  Extracts the next word from a plain (non-boolean) text.

  cs             - charset used to classify characters
  start          - in: position to scan from; out: position just after the
                   returned word (only updated when a word is returned)
  end            - one past the last byte of the input
  word           - receives position and length of the candidate word
  skip_stopwords - when FALSE every candidate is returned; otherwise a
                   candidate is returned only if its character count is in
                   [ft_min_word_len, ft_max_word_len) and it is not a
                   stopword

  RETURN VALUE
  0 - end of input, no word found
  1 - word found
*/
uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint trailing_misc, nchars;
  int char_len;
  int ctype;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Skip everything that cannot start a word. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      char_len= cs->cset->ctype(cs, &ctype, (uchar*)cur, (uchar*)end);
      if (true_word_char(ctype, *cur))
        break;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    /*
      Collect the word. nchars counts characters; trailing_misc counts
      misc_word_char characters at the tail, which are not part of the
      reported length.
    */
    word->pos= cur;
    nchars= 0;
    trailing_misc= 0;
    while (cur < end)
    {
      char_len= cs->cset->ctype(cs, &ctype, (uchar*)cur, (uchar*)end);
      if (true_word_char(ctype, *cur))
        trailing_misc= 0;
      else if (misc_word_char(*cur) && !trailing_misc)
        trailing_misc++;
      else
        break;
      nchars++;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    word->len= (uint)(cur - word->pos) - trailing_misc;

    if (skip_stopwords == FALSE)
    {
      *start= cur;
      DBUG_RETURN(1);
    }
    if (nchars >= ft_min_word_len && nchars < ft_max_word_len &&
        !is_stopword((char*) word->pos, word->len))
    {
      *start= cur;
      DBUG_RETURN(1);
    }
  } while (cur < end);
  DBUG_RETURN(0);
}
248 
/*
  Prepares the word tree for parsing. A tree that is already initialized
  is left as is; otherwise it is set up to hold FT_WORD elements ordered
  by FT_WORD_cmp, with the charset stored as the tree's custom argument.
*/
void ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
{
  DBUG_ENTER("ft_parse_init");
  if (is_tree_inited(wtree))
    DBUG_VOID_RETURN;
  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2) &FT_WORD_cmp,
            0, NULL, cs);
  DBUG_VOID_RETURN;
}
256 
257 
/*
  mysql_add_word callback of the built-in parser: inserts one word into
  the word tree.

  param        - parser parameter; mysql_ftparam points to the
                 MY_FT_PARSER_PARAM holding the tree and the memory root
  word         - start of the word
  word_len     - length of the word in bytes
  boolean_info - unused

  When MYSQL_FTFLAGS_NEED_COPY is set the word is first copied into memory
  taken from the memory root, so the tree does not point into the caller's
  buffer.

  RETURN VALUE
  0 - word inserted
  1 - out of memory (copy or tree insert failed); the tree has been deleted
*/
static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
                       char *word, int word_len,
             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
{
  TREE *wtree;
  FT_WORD w;
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  DBUG_ENTER("ft_add_word");
  wtree= ft_param->wtree;
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
  {
    uchar *ptr;
    DBUG_ASSERT(wtree->with_delete == 0);
    ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
    if (!ptr)
    {
      /* Same cleanup as for a failed tree_insert() below. */
      delete_tree(wtree);
      DBUG_RETURN(1);
    }
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= (uchar*) word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree);
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
285 
286 
/*
  mysql_parse callback of the built-in parser: splits the document into
  words with ft_simple_get_word() (stopwords skipped) and passes each word
  to param->mysql_add_word.

  RETURN VALUE
  0 - the whole document was processed
  1 - mysql_add_word reported an error
*/
static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                             char *doc_arg, int doc_len)
{
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  TREE *wtree= ft_param->wtree;
  uchar *cursor= (uchar*) doc_arg;
  uchar *limit= cursor + doc_len;
  FT_WORD w;
  DBUG_ENTER("ft_parse_internal");

  for (;;)
  {
    if (!ft_simple_get_word(wtree->custom_arg, &cursor, limit, &w, TRUE))
      break;
    if (param->mysql_add_word(param, (char*) w.pos, w.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
302 
303 
/*
  Runs a full-text parser over one document, collecting the words in wtree.

  wtree    - initialized word tree; its custom_arg is used as the charset
  doc      - document text
  doclen   - document length in bytes
  parser   - parser to invoke (must not be NULL)
  param    - parser parameter block, filled in here before the call
  mem_root - memory root made available to the add-word callback

  param->mysql_ftparam points to a structure local to this call, so it is
  only meaningful while parser->parse() is running.

  Returns the value returned by parser->parse().
*/
int ft_parse(TREE *wtree, uchar *doc, int doclen,
             struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
{
  MY_FT_PARSER_PARAM my_param;
  DBUG_ENTER("ft_parse");
  DBUG_ASSERT(parser);

  /* Private state for the callbacks. */
  my_param.mem_root= mem_root;
  my_param.wtree= wtree;
  param->mysql_ftparam= &my_param;

  /* Built-in callbacks. */
  param->mysql_add_word= ft_add_word;
  param->mysql_parse= ft_parse_internal;

  /* Description of the document to parse. */
  param->doc= (char*) doc;
  param->length= doclen;
  param->cs= wtree->custom_arg;
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;

  DBUG_RETURN(parser->parse(param));
}
324 
325 
326 #define MAX_PARAM_NR 2
327 
ftparser_alloc_param(MI_INFO * info)328 MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
329 {
330   if (!info->ftparser_param)
331   {
332     /*
333 .     info->ftparser_param can not be zero after the initialization,
334       because it always includes built-in fulltext parser. And built-in
335       parser can be called even if the table has no fulltext indexes and
336       no varchar/text fields.
337 
338       ftb_find_relevance... parser (ftb_find_relevance_parse,
339       ftb_find_relevance_add_word) calls ftb_check_phrase... parser
340       (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
341     */
342     info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
343       my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
344                 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
345     init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
346   }
347   return info->ftparser_param;
348 }
349 
350 
ftparser_call_initializer(MI_INFO * info,uint keynr,uint paramnr)351 MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
352                                                 uint keynr, uint paramnr)
353 {
354   uint32 ftparser_nr;
355   struct st_mysql_ftparser *parser;
356 
357   if (!ftparser_alloc_param(info))
358     return 0;
359 
360   if (keynr == NO_SUCH_KEY)
361   {
362     ftparser_nr= 0;
363     parser= &ft_default_parser;
364   }
365   else
366   {
367     ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
368     parser= info->s->keyinfo[keynr].parser;
369   }
370   DBUG_ASSERT(paramnr < MAX_PARAM_NR);
371   ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
372   if (! info->ftparser_param[ftparser_nr].mysql_add_word)
373   {
374     /* Note, that mysql_add_word is used here as a flag:
375        mysql_add_word == 0 - parser is not initialized
376        mysql_add_word != 0 - parser is initialized, or no
377                              initialization needed. */
378     info->ftparser_param[ftparser_nr].mysql_add_word=
379       (int (*)(struct st_mysql_ftparser_param *, char *, int,
380               MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
381     if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
382       return 0;
383   }
384   return &info->ftparser_param[ftparser_nr];
385 }
386 
/*
  Releases the parser memory root and runs the deinit hook of every
  initialized parser slot.

  For each full-text key the slots are visited in order; processing of a
  key stops at the first slot whose mysql_add_word flag is not set. Each
  processed slot gets its flag cleared. The parameter array itself is not
  freed here.
*/
void ftparser_call_deinitializer(MI_INFO *info)
{
  uint key_idx, slot_idx;
  uint keys= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (! info->ftparser_param)
    return;

  for (key_idx= 0; key_idx < keys; key_idx++)
  {
    MI_KEYDEF *keyinfo= &info->s->keyinfo[key_idx];
    MYSQL_FTPARSER_PARAM *slots;

    if (!(keyinfo->flag & HA_FULLTEXT))
      continue;

    slots= &info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR];
    for (slot_idx= 0;
         slot_idx < MAX_PARAM_NR && slots[slot_idx].mysql_add_word;
         slot_idx++)
    {
      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(&slots[slot_idx]);
      slots[slot_idx].mysql_add_word= 0;
    }
  }
}
411 
412