1 /* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights
2    reserved.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
23 
24 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
25 
26 #include "ftdefs.h"
27 #include "ctype.h"
28 
29 typedef struct st_ft_docstat {
30   FT_WORD *list;
31   uint uniq;
32   double sum;
33 } FT_DOCSTAT;
34 
35 typedef struct st_my_ft_parser_param
36 {
37   TREE     *wtree;
38   MEM_ROOT *mem_root;
39 } MY_FT_PARSER_PARAM;
40 
/*
  Tree comparison callback: orders two FT_WORD entries by their text,
  delegating to ha_compare_text() with the given character set.
*/
static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
{
  uchar *lhs= (uchar*) w1->pos;
  uchar *rhs= (uchar*) w2->pos;

  return ha_compare_text(cs, lhs, w1->len, rhs, w2->len, 0, 0);
}
46 
/*
  tree_walk() action used by ft_linearize().

  Assigns the local weight (LWS_IN_USE) to the word, adds that weight to
  docstat->sum, copies the word into the current output slot and moves
  docstat->list on to the next slot.  Always returns 0.

  NOTE: the parameter name `count` must stay as is; LWS_IN_USE is a macro
  defined elsewhere and presumably refers to it by name.
*/
static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat)
{
  FT_WORD *slot= docstat->list;

  word->weight= LWS_IN_USE;
  docstat->sum+= word->weight;

  memcpy(slot, word, sizeof(FT_WORD));
  docstat->list= slot + 1;

  return 0;
}
54 
55 /* transforms tree of words into the array, applying normalization */
56 
ft_linearize(TREE * wtree,MEM_ROOT * mem_root)57 FT_WORD * ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
58 {
59   FT_WORD *wlist,*p;
60   FT_DOCSTAT docstat;
61   DBUG_ENTER("ft_linearize");
62 
63   if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
64                                     (1+wtree->elements_in_tree))))
65   {
66     docstat.list=wlist;
67     docstat.uniq=wtree->elements_in_tree;
68     docstat.sum=0;
69     tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
70   }
71   delete_tree(wtree);
72   if (!wlist)
73     DBUG_RETURN(NULL);
74 
75   docstat.list->pos=NULL;
76 
77   for (p=wlist;p->pos;p++)
78   {
79     p->weight=PRENORM_IN_USE;
80   }
81 
82   for (p=wlist;p->pos;p++)
83   {
84     p->weight/=NORM_IN_USE;
85   }
86 
87   DBUG_RETURN(wlist);
88 }
89 
ft_boolean_check_syntax_string(const uchar * str)90 my_bool ft_boolean_check_syntax_string(const uchar *str)
91 {
92   uint i, j;
93 
94   if (!str ||
95       (strlen((char*) str)+1 != sizeof(DEFAULT_FTB_SYNTAX)) ||
96       (str[0] != ' ' && str[1] != ' '))
97     return 1;
98   for (i=0; i<sizeof(DEFAULT_FTB_SYNTAX); i++)
99   {
100     /* limiting to 7-bit ascii only */
101     if ((unsigned char)(str[i]) > 127 || isalnum(str[i]))
102       return 1;
103     for (j=0; j<i; j++)
104       if (str[i] == str[j] && (i != 11 || j != 10))
105         return 1;
106   }
107   return 0;
108 }
109 
110 /*
111   RETURN VALUE
112   0 - eof
113   1 - word found
114   2 - left bracket
115   3 - right bracket
116   4 - stopword found
117 */
ft_get_word(const CHARSET_INFO * cs,uchar ** start,uchar * end,FT_WORD * word,MYSQL_FTPARSER_BOOLEAN_INFO * param)118 uchar ft_get_word(const CHARSET_INFO *cs, uchar **start, uchar *end,
119                   FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
120 {
121   uchar *doc=*start;
122   int ctype;
123   uint mwc, length;
124   int mbl;
125 
126   param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
127   param->weight_adjust= param->wasign= 0;
128   param->type= FT_TOKEN_EOF;
129 
130   while (doc<end)
131   {
132     for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
133     {
134       mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
135       if (true_word_char(ctype, *doc))
136         break;
137       if (*doc == FTB_RQUOT && param->quot)
138       {
139         *start=doc+1;
140         param->type= FT_TOKEN_RIGHT_PAREN;
141         goto ret;
142       }
143       if (!param->quot)
144       {
145         if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
146         {
147           /* param->prev=' '; */
148           *start=doc+1;
149           if (*doc == FTB_LQUOT)
150             param->quot= (char*) 1;
151           param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
152           goto ret;
153         }
154         if (param->prev == ' ')
155         {
156           if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
157           if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
158           if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
159           if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
160           if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
161           if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
162         }
163       }
164       param->prev=*doc;
165       param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
166       param->weight_adjust= param->wasign= 0;
167     }
168 
169     mwc=length=0;
170     for (word->pos= doc; doc < end; length++,
171          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
172     {
173       mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
174       if (true_word_char(ctype, *doc))
175         mwc=0;
176       else if (!misc_word_char(*doc) || mwc)
177         break;
178       else
179         mwc++;
180     }
181     param->prev='A'; /* be sure *prev is true_word_char */
182     word->len= (uint)(doc-word->pos) - mwc;
183     if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
184       doc++;
185 
186     if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
187                                                     word->len))
188          || param->trunc) && length < ft_max_word_len)
189     {
190       *start=doc;
191       param->type= FT_TOKEN_WORD;
192       goto ret;
193     }
194     else if (length) /* make sure length > 0 (if start contains spaces only) */
195     {
196       *start= doc;
197       param->type= FT_TOKEN_STOPWORD;
198       goto ret;
199     }
200   }
201   if (param->quot)
202   {
203     *start= doc;
204     param->type= 3; /* FT_RBR */
205     goto ret;
206   }
207 ret:
208   return param->type;
209 }
210 
/*
  Extracts the next word from plain text (no boolean syntax).

  Scanning starts at *start and stops at `end`.  Non-word characters are
  skipped, then a run of word characters is stored in `word` (pos/len,
  with trailing misc_word_char characters excluded from len).

  When skip_stopwords is FALSE every word found is returned.  Otherwise a
  word is returned only if its character count is >= ft_min_word_len and
  < ft_max_word_len and it is not a stopword; rejected words are skipped
  and scanning continues.

  Returns 1 with *start advanced past the word, or 0 when the input is
  exhausted (in which case *start is left unchanged).
*/
uchar ft_simple_get_word(const CHARSET_INFO *cs, uchar **start,
                         const uchar *end,
                         FT_WORD *word, my_bool skip_stopwords)
{
  uchar *cur= *start;
  uint trailing_misc, nchars;
  int char_len;
  int char_type;
  DBUG_ENTER("ft_simple_get_word");

  do
  {
    /* Skip everything up to the first true word character. */
    for (;;)
    {
      if (cur >= end)
        DBUG_RETURN(0);
      char_len= cs->cset->ctype(cs, &char_type, (uchar*)cur, (uchar*)end);
      if (true_word_char(char_type, *cur))
        break;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    /* Collect the word itself. */
    word->pos= cur;
    trailing_misc= 0;
    nchars= 0;
    while (cur < end)
    {
      char_len= cs->cset->ctype(cs, &char_type, (uchar*)cur, (uchar*)end);
      if (true_word_char(char_type, *cur))
        trailing_misc= 0;
      else if (!misc_word_char(*cur) || trailing_misc)
        break;
      else
        trailing_misc++;

      nchars++;
      cur+= (char_len > 0 ? char_len : (char_len < 0 ? -char_len : 1));
    }

    word->len= (uint)(cur-word->pos) - trailing_misc;

    if (skip_stopwords != FALSE)
    {
      /* too short or too long: try the next word */
      if (nchars < ft_min_word_len || nchars >= ft_max_word_len)
        continue;
      if (is_stopword((char*) word->pos, word->len))
        continue;
    }

    *start= cur;
    DBUG_RETURN(1);
  } while (cur < end);

  DBUG_RETURN(0);
}
257 
/*
  Prepares `wtree` for collecting FT_WORD entries.  A tree that is already
  initialized is left untouched; otherwise it is set up with FT_WORD_cmp
  as comparator and `cs` as its custom argument.
*/
void ft_parse_init(TREE *wtree, const CHARSET_INFO *cs)
{
  DBUG_ENTER("ft_parse_init");
  if (is_tree_inited(wtree))
  {
    DBUG_VOID_RETURN;
  }
  init_tree(wtree, 0, 0, sizeof(FT_WORD), (qsort_cmp2) &FT_WORD_cmp,
            0, NULL, cs);
  DBUG_VOID_RETURN;
}
265 
266 
/*
  mysql_add_word callback installed by ft_parse(): inserts one word into
  the word tree found in param->mysql_ftparam.

  If MYSQL_FTFLAGS_NEED_COPY is set in param->flags, the word bytes are
  first copied into memory taken from the parser's mem_root, so that the
  tree does not reference the caller's buffer.

  Returns 0 on success.  Returns 1 if the copy could not be allocated or
  the tree insertion failed; in both cases the tree is deleted.
*/
static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
                       char *word, int word_len,
             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
{
  TREE *wtree;
  FT_WORD w;
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  DBUG_ENTER("ft_add_word");
  wtree= ft_param->wtree;
  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
  {
    uchar *ptr;
    assert(wtree->with_delete == 0);
    ptr= (uchar *)alloc_root(ft_param->mem_root, word_len);
    if (!ptr)
    {
      /* out of memory: fail the same way a failed tree_insert() does */
      delete_tree(wtree);
      DBUG_RETURN(1);
    }
    memcpy(ptr, word, word_len);
    w.pos= ptr;
  }
  else
    w.pos= (uchar*) word;
  w.len= word_len;
  if (!tree_insert(wtree, &w, 0, wtree->custom_arg))
  {
    delete_tree(wtree);
    DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
294 
295 
/*
  mysql_parse callback installed by ft_parse(): splits the document into
  words with ft_simple_get_word() (stopword filtering enabled) and passes
  each one to param->mysql_add_word.

  Returns 0 when the whole document was processed, 1 as soon as
  mysql_add_word reports a failure.
*/
static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
                             char *doc_arg, int doc_len)
{
  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
  TREE *wtree= ft_param->wtree;
  uchar *cursor= (uchar*) doc_arg;
  uchar *limit= cursor + doc_len;
  FT_WORD token;
  DBUG_ENTER("ft_parse_internal");

  for (;;)
  {
    if (!ft_simple_get_word(wtree->custom_arg, &cursor, limit, &token, TRUE))
      break;
    if (param->mysql_add_word(param, (char*) token.pos, token.len, 0))
      DBUG_RETURN(1);
  }
  DBUG_RETURN(0);
}
311 
312 
/*
  Runs `parser` over a document and collects the resulting words in wtree.

  Fills `param` with the built-in callbacks (ft_parse_internal,
  ft_add_word), the document, the tree's character set and simple parsing
  mode, then calls parser->parse().  The private state referenced through
  param->mysql_ftparam lives on this function's stack, so it is valid only
  for the duration of the call.

  Returns whatever parser->parse() returns.
*/
int ft_parse(TREE *wtree, uchar *doc, int doclen,
             struct st_mysql_ftparser *parser,
             MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
{
  MY_FT_PARSER_PARAM my_param;
  DBUG_ENTER("ft_parse");
  assert(parser);

  /* private state for the callbacks */
  my_param.mem_root= mem_root;
  my_param.wtree= wtree;
  param->mysql_ftparam= &my_param;

  /* callbacks */
  param->mysql_add_word= ft_add_word;
  param->mysql_parse= ft_parse_internal;

  /* document and parsing mode */
  param->doc= (char*) doc;
  param->length= doclen;
  param->cs= wtree->custom_arg;
  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;

  DBUG_RETURN(parser->parse(param));
}
333 
334 
335 #define MAX_PARAM_NR 2
336 
ftparser_alloc_param(MI_INFO * info)337 MYSQL_FTPARSER_PARAM* ftparser_alloc_param(MI_INFO *info)
338 {
339   if (!info->ftparser_param)
340   {
341     /*
342 .     info->ftparser_param can not be zero after the initialization,
343       because it always includes built-in fulltext parser. And built-in
344       parser can be called even if the table has no fulltext indexes and
345       no varchar/text fields.
346 
347       ftb_find_relevance... parser (ftb_find_relevance_parse,
348       ftb_find_relevance_add_word) calls ftb_check_phrase... parser
349       (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
350     */
351     info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
352       my_malloc(mi_key_memory_FTPARSER_PARAM,
353                 MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
354                 info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL));
355     init_alloc_root(mi_key_memory_ft_memroot,
356                     &info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
357   }
358   return info->ftparser_param;
359 }
360 
361 
ftparser_call_initializer(MI_INFO * info,uint keynr,uint paramnr)362 MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
363                                                 uint keynr, uint paramnr)
364 {
365   uint32 ftparser_nr;
366   struct st_mysql_ftparser *parser;
367 
368   if (!ftparser_alloc_param(info))
369     return 0;
370 
371   if (keynr == NO_SUCH_KEY)
372   {
373     ftparser_nr= 0;
374     parser= &ft_default_parser;
375   }
376   else
377   {
378     ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
379     parser= info->s->keyinfo[keynr].parser;
380   }
381   assert(paramnr < MAX_PARAM_NR);
382   ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr;
383   if (! info->ftparser_param[ftparser_nr].mysql_add_word)
384   {
385     /* Note, that mysql_add_word is used here as a flag:
386        mysql_add_word == 0 - parser is not initialized
387        mysql_add_word != 0 - parser is initialized, or no
388                              initialization needed. */
389     info->ftparser_param[ftparser_nr].mysql_add_word=
390       (int (*)(struct st_mysql_ftparser_param *, char *, int,
391               MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
392     if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
393       return 0;
394   }
395   return &info->ftparser_param[ftparser_nr];
396 }
397 
/*
  Releases info->ft_memroot and runs the deinit hook of every fulltext
  key's parser for each slot that was marked initialized, clearing the
  mark (mysql_add_word) afterwards.

  For a given key the slots are visited in order and processing stops at
  the first slot that is not marked.  The parameter array itself is not
  freed here.
*/
void ftparser_call_deinitializer(MI_INFO *info)
{
  uint key_idx, slot_idx;
  uint nkeys= info->s->state.header.keys;

  free_root(&info->ft_memroot, MYF(0));
  if (!info->ftparser_param)
    return;

  for (key_idx= 0; key_idx < nkeys; key_idx++)
  {
    MI_KEYDEF *keyinfo= &info->s->keyinfo[key_idx];

    /* only fulltext keys own parser slots */
    if (!(keyinfo->flag & HA_FULLTEXT))
      continue;

    for (slot_idx= 0; slot_idx < MAX_PARAM_NR; slot_idx++)
    {
      MYSQL_FTPARSER_PARAM *ftparser_param=
        &info->ftparser_param[keyinfo->ftkey_nr * MAX_PARAM_NR + slot_idx];

      if (!ftparser_param->mysql_add_word)
        break;

      if (keyinfo->parser->deinit)
        keyinfo->parser->deinit(ftparser_param);
      ftparser_param->mysql_add_word= 0;
    }
  }
}
422 
423