1 /* Copyright (c) 2001, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
24 
25 /*  TODO: add caching - pre-read several index entries at once */
26 
27 /*
28   Added optimization for full-text queries with plus-words. It was
29   implemented by sharing maximal document id (max_docid) variable
30   inside plus subtree. max_docid could be used by any word in plus
31   subtree, but it could be updated by plus-word only.
32 
33   Fulltext "smarter index merge" optimization assumes that rows
34   it gets are ordered by doc_id. That is not the case when we
35   search for a word with truncation operator. It may return
36   rows in random order. Thus we may not use "smarter index merge"
37   optimization with "trunc-words".
38 
39   The idea is: there is no need to search for docid smaller than
40   biggest docid inside current plus subtree or any upper plus subtree.
41 
42   Examples:
43   +word1 word2
44     share same max_docid
45     max_docid updated by word1
46   +word1 +(word2 word3)
47     share same max_docid
48     max_docid updated by word1
49   +(word1 -word2) +(+word3 word4)
50     share same max_docid
51     max_docid updated by word3
52    +word1 word2 (+word3 word4 (+word5 word6))
53     three subexpressions (including the top-level one),
54     every one has its own max_docid, updated by its plus word.
55     but for the search word6 uses
56     max(word1.max_docid, word3.max_docid, word5.max_docid),
57     while word4 uses, accordingly,
58     max(word1.max_docid, word3.max_docid).
59 */
60 
61 #define FT_CORE
62 #include "ftdefs.h"
63 
64 /* search with boolean queries */
65 
66 static double _wghts[11]=
67 {
68   0.131687242798354,
69   0.197530864197531,
70   0.296296296296296,
71   0.444444444444444,
72   0.666666666666667,
73   1.000000000000000,
74   1.500000000000000,
75   2.250000000000000,
76   3.375000000000000,
77   5.062500000000000,
78   7.593750000000000};
79 static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
80 
81 static double _nwghts[11]=
82 {
83  -0.065843621399177,
84  -0.098765432098766,
85  -0.148148148148148,
86  -0.222222222222222,
87  -0.333333333333334,
88  -0.500000000000000,
89  -0.750000000000000,
90  -1.125000000000000,
91  -1.687500000000000,
92  -2.531250000000000,
93  -3.796875000000000};
94 static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
95 
96 #define FTB_FLAG_TRUNC 1
97 /* At most one of the following flags can be set */
98 #define FTB_FLAG_YES   2
99 #define FTB_FLAG_NO    4
100 #define FTB_FLAG_WONLY 8
101 
102 #define CMP_NUM(a,b)    (((a) < (b)) ? -1 : ((a) == (b)) ? 0 : 1)
103 
104 typedef struct st_ftb_expr FTB_EXPR;
105 struct st_ftb_expr
106 {
107   FTB_EXPR *up;
108   uint      flags;
109 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
110   my_off_t  docid[2];
111   my_off_t  max_docid;
112   float     weight;
113   float     cur_weight;
114   LIST     *phrase;               /* phrase words */
115   LIST     *document;             /* for phrase search */
116   uint      yesses;               /* number of "yes" words matched */
117   uint      nos;                  /* number of "no"  words matched */
118   uint      ythresh;              /* number of "yes" words in expr */
119   uint      yweaks;               /* number of "yes" words for scan only */
120 };
121 
122 typedef struct st_ftb_word
123 {
124   FTB_EXPR  *up;
125   uint       flags;
126 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
127   my_off_t   docid[2];             /* for index search and for scan */
128   my_off_t   key_root;
129   FTB_EXPR  *max_docid_expr;
130   MI_KEYDEF *keyinfo;
131   struct st_ftb_word *prev;
132   float      weight;
133   uint       ndepth;
134   uint       len;
135   uchar      off;
136   uchar      word[1];
137 } FTB_WORD;
138 
139 typedef struct st_ft_info
140 {
141   struct _ft_vft *please;
142   MI_INFO   *info;
143   const CHARSET_INFO *charset;
144   FTB_EXPR  *root;
145   FTB_WORD **list;
146   FTB_WORD  *last_word;
147   MEM_ROOT   mem_root;
148   QUEUE      queue;
149   TREE       no_dupes;
150   my_off_t   lastpos;
151   uint       keynr;
152   uchar      with_scan;
153   enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
154 } FTB;
155 
FTB_WORD_cmp(my_off_t * v,FTB_WORD * a,FTB_WORD * b)156 static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
157 {
158   int i;
159 
160   /* if a==curdoc, take it as  a < b */
161   if (v && a->docid[0] == *v)
162     return -1;
163 
164   /* ORDER BY docid, ndepth DESC */
165   i=CMP_NUM(a->docid[0], b->docid[0]);
166   if (!i)
167     i=CMP_NUM(b->ndepth,a->ndepth);
168   return i;
169 }
170 
FTB_WORD_cmp_list(CHARSET_INFO * cs,FTB_WORD ** a,FTB_WORD ** b)171 static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
172 {
173   /* ORDER BY word, ndepth */
174   int i= ha_compare_text(cs, (uchar*) (*a)->word + 1, (*a)->len - 1,
175                              (uchar*) (*b)->word + 1, (*b)->len - 1, 0, 0);
176   if (!i)
177     i= CMP_NUM((*a)->ndepth, (*b)->ndepth);
178   return i;
179 }
180 
181 
182 typedef struct st_my_ftb_param
183 {
184   FTB *ftb;
185   FTB_EXPR *ftbe;
186   uchar *up_quot;
187   uint depth;
188 } MY_FTB_PARAM;
189 
190 
ftb_query_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * info)191 static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
192                               char *word, int word_len,
193                               MYSQL_FTPARSER_BOOLEAN_INFO *info)
194 {
195   MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
196   FTB_WORD *ftbw;
197   FTB_EXPR *ftbe, *tmp_expr;
198   FT_WORD *phrase_word;
199   LIST *tmp_element;
200   int r= info->weight_adjust;
201   float weight= (float)
202         (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
203 
204   switch (info->type) {
205     case FT_TOKEN_WORD:
206       ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
207                                    sizeof(FTB_WORD) +
208                                    (info->trunc ? MI_MAX_KEY_BUFF :
209                                     (word_len + 1) *
210                                     ftb_param->ftb->charset->mbmaxlen +
211                                     HA_FT_WLEN +
212                                     ftb_param->ftb->info->s->rec_reflength));
213       ftbw->len= word_len + 1;
214       ftbw->flags= 0;
215       ftbw->off= 0;
216       if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
217       if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
218       if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
219       ftbw->weight= weight;
220       ftbw->up= ftb_param->ftbe;
221       ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
222       ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
223       ftbw->key_root= HA_OFFSET_ERROR;
224       memcpy(ftbw->word + 1, word, word_len);
225       ftbw->word[0]= word_len;
226       if (info->yesno > 0) ftbw->up->ythresh++;
227       ftb_param->ftb->queue.max_elements++;
228       ftbw->prev= ftb_param->ftb->last_word;
229       ftb_param->ftb->last_word= ftbw;
230       ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
231       for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
232         if (! (tmp_expr->flags & FTB_FLAG_YES))
233           break;
234       ftbw->max_docid_expr= tmp_expr;
235       /* fall through */
236     case FT_TOKEN_STOPWORD:
237       if (! ftb_param->up_quot) break;
238       phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
239       tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
240       phrase_word->pos= (uchar*) word;
241       phrase_word->len= word_len;
242       tmp_element->data= (void *)phrase_word;
243       ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
244       /* Allocate document list at this point.
245          It allows to avoid huge amount of allocs/frees for each row.*/
246       tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
247       tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
248       ftb_param->ftbe->document=
249         list_add(ftb_param->ftbe->document, tmp_element);
250       break;
251     case FT_TOKEN_LEFT_PAREN:
252       ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
253       ftbe->flags= 0;
254       if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
255       if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
256       ftbe->weight= weight;
257       ftbe->up= ftb_param->ftbe;
258       ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
259       ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
260       ftbe->phrase= NULL;
261       ftbe->document= 0;
262       if (info->quot) ftb_param->ftb->with_scan|= 2;
263       if (info->yesno > 0) ftbe->up->ythresh++;
264       ftb_param->ftbe= ftbe;
265       ftb_param->depth++;
266       ftb_param->up_quot= (uchar*) info->quot;
267       break;
268     case FT_TOKEN_RIGHT_PAREN:
269       if (ftb_param->ftbe->document)
270       {
271         /* Circuit document list */
272         for (tmp_element= ftb_param->ftbe->document;
273              tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
274         tmp_element->next= ftb_param->ftbe->document;
275         ftb_param->ftbe->document->prev= tmp_element;
276       }
277       info->quot= 0;
278       if (ftb_param->ftbe->up)
279       {
280         assert(ftb_param->depth);
281         ftb_param->ftbe= ftb_param->ftbe->up;
282         ftb_param->depth--;
283         ftb_param->up_quot= 0;
284       }
285       break;
286     case FT_TOKEN_EOF:
287     default:
288       break;
289   }
290   return(0);
291 }
292 
293 
ftb_parse_query_internal(MYSQL_FTPARSER_PARAM * param,char * query,int len)294 static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
295                                     char *query, int len)
296 {
297   MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
298   MYSQL_FTPARSER_BOOLEAN_INFO info;
299   const CHARSET_INFO *cs= ftb_param->ftb->charset;
300   uchar **start= (uchar**) &query;
301   uchar *end= (uchar*) query + len;
302   FT_WORD w;
303 
304   info.prev= ' ';
305   info.quot= 0;
306   while (ft_get_word(cs, start, end, &w, &info))
307     param->mysql_add_word(param, (char*) w.pos, w.len, &info);
308   return(0);
309 }
310 
311 
_ftb_parse_query(FTB * ftb,uchar * query,uint len,struct st_mysql_ftparser * parser)312 static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
313                             struct st_mysql_ftparser *parser)
314 {
315   MYSQL_FTPARSER_PARAM *param;
316   MY_FTB_PARAM ftb_param;
317   DBUG_ENTER("_ftb_parse_query");
318   assert(parser);
319 
320   if (ftb->state != UNINITIALIZED)
321     DBUG_RETURN(0);
322   if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
323     DBUG_RETURN(1);
324 
325   ftb_param.ftb= ftb;
326   ftb_param.depth= 0;
327   ftb_param.ftbe= ftb->root;
328   ftb_param.up_quot= 0;
329 
330   param->mysql_parse= ftb_parse_query_internal;
331   param->mysql_add_word= ftb_query_add_word;
332   param->mysql_ftparam= (void *)&ftb_param;
333   param->cs= ftb->charset;
334   param->doc= (char*) query;
335   param->length= len;
336   param->flags= 0;
337   param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
338   DBUG_RETURN(parser->parse(param));
339 }
340 
341 
_ftb_no_dupes_cmp(const void * not_used MY_ATTRIBUTE ((unused)),const void * a,const void * b)342 static int _ftb_no_dupes_cmp(const void* not_used MY_ATTRIBUTE((unused)),
343                              const void *a,const void *b)
344 {
345   return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
346 }
347 
348 /*
349   When performing prefix search (a word with truncation operator), we
350   must preserve original prefix to ensure that characters which may be
351   expanded/contracted do not break the prefix. This is done by storing
352   newly found key immediatly after the original word in ftbw->word
353   buffer.
354 
355   ftbw->word= LENGTH WORD [ LENGTH1 WORD1 ] WEIGHT REFERENCE
356   LENGTH - 1 byte, length of the WORD
357   WORD - LENGTH bytes, the word itself
358   LENGTH1 - 1 byte, length of the WORD1, present in case of prefix search
359   WORD1 - LENGTH bytes, the word itself, present in case of prefix search
360   WEIGHT - 4 bytes (HA_FT_WLEN), either weight or number of subkeys
361   REFERENCE - rec_reflength bytes, pointer to the record
362 
363   returns 1 if the search was finished (must-word wasn't found)
364 */
_ft2_search_no_lock(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)365 static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
366 {
367   int r;
368   int subkeys=1;
369   my_bool can_go_down;
370   MI_INFO *info=ftb->info;
371   uint off= 0, extra= HA_FT_WLEN + info->s->rec_reflength;
372   uchar *lastkey_buf=ftbw->word+ftbw->off;
373   uint max_word_length= (ftbw->flags & FTB_FLAG_TRUNC) ? MI_MAX_KEY_BUFF :
374                         ((ftbw->len) * ftb->charset->mbmaxlen) + extra;
375 
376   if (ftbw->flags & FTB_FLAG_TRUNC)
377     lastkey_buf+=ftbw->len;
378 
379   if (init_search)
380   {
381     ftbw->key_root=info->s->state.key_root[ftb->keynr];
382     ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
383 
384     r=_mi_search(info, ftbw->keyinfo, (uchar*) ftbw->word, ftbw->len,
385                  SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
386   }
387   else
388   {
389     uint sflag= SEARCH_BIGGER;
390     my_off_t max_docid=0;
391     FTB_EXPR *tmp;
392 
393     for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
394       set_if_bigger(max_docid, tmp->max_docid);
395 
396     if (ftbw->docid[0] < max_docid)
397     {
398       sflag|= SEARCH_SAME;
399       _mi_dpointer(info, (uchar*) (lastkey_buf + HA_FT_WLEN +
400                                    (ftbw->off ? 0 : lastkey_buf[0] + 1)),
401                    max_docid);
402     }
403     r=_mi_search(info, ftbw->keyinfo, (uchar*) lastkey_buf,
404                    USE_WHOLE_KEY, sflag, ftbw->key_root);
405   }
406 
407   can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
408   /* Skip rows inserted by concurrent insert */
409   while (!r)
410   {
411     if (can_go_down)
412     {
413       /* going down ? */
414       off=info->lastkey_length-extra;
415       subkeys=ft_sintXkorr(info->lastkey+off);
416     }
417     if (subkeys<0 || info->lastpos < info->state->data_file_length)
418       break;
419     r= _mi_search_next(info, ftbw->keyinfo, info->lastkey,
420                        info->lastkey_length,
421 		       SEARCH_BIGGER, ftbw->key_root);
422   }
423 
424   if (!r && !ftbw->off)
425   {
426     r= ha_compare_text(ftb->charset,
427                        info->lastkey+1,
428                        info->lastkey_length-extra-1,
429               (uchar*) ftbw->word+1,
430                        ftbw->len-1,
431              (my_bool) (ftbw->flags & FTB_FLAG_TRUNC),0);
432   }
433 
434   if (r || info->lastkey_length > max_word_length) /* not found */
435   {
436     if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
437     {
438       ftbw->docid[0]=HA_OFFSET_ERROR;
439       if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
440       {
441         /*
442           This word MUST BE present in every document returned,
443           so we can stop the search right now
444         */
445         ftb->state=INDEX_DONE;
446         return 1; /* search is done */
447       }
448       else
449         return 0;
450     }
451 
452     /*
453       Going up to the first-level tree to continue search there.
454       Only done when performing prefix search.
455 
456       Key buffer data pointer as well as docid[0] may be smaller
457       than values we got while searching first-level tree. Thus
458       they must be restored to original values to avoid dead-loop,
459       when subsequent search for a bigger value eventually ends up
460       in this same second-level tree.
461     */
462     _mi_dpointer(info, (uchar*) (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
463     ftbw->docid[0]= ftbw->key_root;
464     ftbw->key_root=info->s->state.key_root[ftb->keynr];
465     ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
466     ftbw->off=0;
467     return _ft2_search_no_lock(ftb, ftbw, 0);
468   }
469 
470   /* matching key found */
471   memcpy(lastkey_buf, info->lastkey, info->lastkey_length);
472   if (lastkey_buf == ftbw->word)
473     ftbw->len=info->lastkey_length-extra;
474 
475   /* going down ? */
476   if (subkeys<0)
477   {
478     /*
479       yep, going down, to the second-level tree
480       TODO here: subkey-based optimization
481     */
482     ftbw->off=off;
483     ftbw->key_root=info->lastpos;
484     ftbw->keyinfo=& info->s->ft2_keyinfo;
485     r=_mi_search_first(info, ftbw->keyinfo, ftbw->key_root);
486     assert(r==0);  /* found something */
487     memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length);
488   }
489   ftbw->docid[0]=info->lastpos;
490   if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
491     ftbw->max_docid_expr->max_docid= info->lastpos;
492   return 0;
493 }
494 
_ft2_search(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)495 static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
496 {
497   int r;
498   MYISAM_SHARE *share= ftb->info->s;
499   if (share->concurrent_insert)
500     mysql_rwlock_rdlock(&share->key_root_lock[ftb->keynr]);
501   r= _ft2_search_no_lock(ftb, ftbw, init_search);
502   if (share->concurrent_insert)
503     mysql_rwlock_unlock(&share->key_root_lock[ftb->keynr]);
504   return r;
505 }
506 
_ftb_init_index_search(FT_INFO * ftb)507 static void _ftb_init_index_search(FT_INFO *ftb)
508 {
509   int i;
510   FTB_WORD   *ftbw;
511 
512   if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
513     return;
514   ftb->state=INDEX_SEARCH;
515 
516   for (i=ftb->queue.elements; i; i--)
517   {
518     ftbw=(FTB_WORD *)(ftb->queue.root[i]);
519 
520     if (ftbw->flags & FTB_FLAG_TRUNC)
521     {
522       /*
523         special treatment for truncation operator
524         1. there are some (besides this) +words
525            | no need to search in the index, it can never ADD new rows
526            | to the result, and to remove half-matched rows we do scan anyway
527         2. -trunc*
528            | same as 1.
529         3. in 1 and 2, +/- need not be on the same expr. level,
530            but can be on any upper level, as in +word +(trunc1* trunc2*)
531         4. otherwise
532            | We have to index-search for this prefix.
533            | It may cause duplicates, as in the index (sorted by <word,docid>)
534            |   <aaaa,row1>
535            |   <aabb,row2>
536            |   <aacc,row1>
537            | Searching for "aa*" will find row1 twice...
538       */
539       FTB_EXPR *ftbe;
540       for (ftbe=(FTB_EXPR*)ftbw;
541            ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
542            ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
543       {
544         if (ftbe->flags & FTB_FLAG_NO ||                     /* 2 */
545             ftbe->up->ythresh - ftbe->up->yweaks >
546             (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES))      /* 1 */
547         {
548           FTB_EXPR *top_ftbe=ftbe->up;
549           ftbw->docid[0]=HA_OFFSET_ERROR;
550           for (ftbe=(FTB_EXPR *)ftbw;
551                ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
552                ftbe=ftbe->up)
553               ftbe->up->yweaks++;
554           ftbe=0;
555           break;
556         }
557       }
558       if (!ftbe)
559         continue;
560       /* 4 */
561       if (!is_tree_inited(& ftb->no_dupes))
562         init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
563             _ftb_no_dupes_cmp,0,0,0);
564       else
565         reset_tree(& ftb->no_dupes);
566     }
567 
568     ftbw->off=0; /* in case of reinit */
569     if (_ft2_search(ftb, ftbw, 1))
570       return;
571   }
572   queue_fix(& ftb->queue);
573 }
574 
575 
ft_init_boolean_search(MI_INFO * info,uint keynr,uchar * query,uint query_len,const CHARSET_INFO * cs)576 FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
577                                  uint query_len, const CHARSET_INFO *cs)
578 {
579   FTB       *ftb;
580   FTB_EXPR  *ftbe;
581   FTB_WORD  *ftbw;
582 
583   if (!(ftb=(FTB *)my_malloc(mi_key_memory_FTB,
584                              sizeof(FTB), MYF(MY_WME))))
585     return 0;
586   ftb->please= (struct _ft_vft *) & _ft_vft_boolean;
587   ftb->state=UNINITIALIZED;
588   ftb->info=info;
589   ftb->keynr=keynr;
590   ftb->charset=cs;
591   assert(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
592   ftb->with_scan=0;
593   ftb->lastpos=HA_OFFSET_ERROR;
594   memset(&ftb->no_dupes, 0, sizeof(TREE));
595   ftb->last_word= 0;
596 
597   init_alloc_root(PSI_INSTRUMENT_ME, &ftb->mem_root, 1024, 1024);
598   ftb->queue.max_elements= 0;
599   if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
600     goto err;
601   ftbe->weight=1;
602   ftbe->flags=FTB_FLAG_YES;
603   ftbe->nos=1;
604   ftbe->up=0;
605   ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
606   ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
607   ftbe->phrase= NULL;
608   ftbe->document= 0;
609   ftb->root=ftbe;
610   if (unlikely(_ftb_parse_query(ftb, query, query_len,
611                                 keynr == NO_SUCH_KEY ? &ft_default_parser :
612                                 info->s->keyinfo[keynr].parser)))
613     goto err;
614   /*
615     Hack: instead of init_queue, we'll use reinit queue to be able
616     to alloc queue with alloc_root()
617   */
618   if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
619                                               (ftb->queue.max_elements + 1) *
620                                               sizeof(void *))))
621     goto err;
622   reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
623                          (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
624   for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
625     queue_insert(&ftb->queue, (uchar *)ftbw);
626   ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
627                                      sizeof(FTB_WORD *)*ftb->queue.elements);
628   memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
629   my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
630             (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
631   if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
632   ftb->state=READY;
633   return ftb;
634 err:
635   free_root(& ftb->mem_root, MYF(0));
636   my_free(ftb);
637   return 0;
638 }
639 
640 
641 typedef struct st_my_ftb_phrase_param
642 {
643   LIST *phrase;
644   LIST *document;
645   const CHARSET_INFO *cs;
646   uint phrase_length;
647   uint document_length;
648   uint match;
649 } MY_FTB_PHRASE_PARAM;
650 
651 
ftb_phrase_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))652 static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
653                                char *word, int word_len,
654     MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
655 {
656   MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
657   FT_WORD *w= (FT_WORD *)phrase_param->document->data;
658   LIST *phrase, *document;
659   w->pos= (uchar*) word;
660   w->len= word_len;
661   phrase_param->document= phrase_param->document->prev;
662   if (phrase_param->phrase_length > phrase_param->document_length)
663   {
664     phrase_param->document_length++;
665     return 0;
666   }
667   /* TODO: rewrite phrase search to avoid
668      comparing the same word twice. */
669   for (phrase= phrase_param->phrase, document= phrase_param->document->next;
670        phrase; phrase= phrase->next, document= document->next)
671   {
672     FT_WORD *phrase_word= (FT_WORD *)phrase->data;
673     FT_WORD *document_word= (FT_WORD *)document->data;
674     if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
675                      phrase_word->len,
676                      (uchar*) document_word->pos, document_word->len))
677       return 0;
678   }
679   phrase_param->match++;
680   return 0;
681 }
682 
683 
ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM * param,char * document,int len)684 static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
685                                      char *document, int len)
686 {
687   FT_WORD word;
688   MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
689   const uchar *docend= (uchar*) document + len;
690   while (ft_simple_get_word(phrase_param->cs, (uchar**) &document, docend,
691                             &word, FALSE))
692   {
693     param->mysql_add_word(param, (char*) word.pos, word.len, 0);
694     if (phrase_param->match)
695       break;
696   }
697   return 0;
698 }
699 
700 
701 /*
702   Checks if given buffer matches phrase list.
703 
704   SYNOPSIS
705     _ftb_check_phrase()
706     s0     start of buffer
707     e0     end of buffer
708     phrase broken into list phrase
709     cs     charset info
710 
711   RETURN VALUE
712     1 is returned if phrase found, 0 else.
713     -1 is returned if error occurs.
714 */
715 
_ftb_check_phrase(FTB * ftb,const uchar * document,uint len,FTB_EXPR * ftbe,struct st_mysql_ftparser * parser)716 static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
717                 FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
718 {
719   MY_FTB_PHRASE_PARAM ftb_param;
720   MYSQL_FTPARSER_PARAM *param;
721   DBUG_ENTER("_ftb_check_phrase");
722   assert(parser);
723 
724   if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
725     DBUG_RETURN(0);
726 
727   ftb_param.phrase= ftbe->phrase;
728   ftb_param.document= ftbe->document;
729   ftb_param.cs= ftb->charset;
730   ftb_param.phrase_length= list_length(ftbe->phrase);
731   ftb_param.document_length= 1;
732   ftb_param.match= 0;
733 
734   param->mysql_parse= ftb_check_phrase_internal;
735   param->mysql_add_word= ftb_phrase_add_word;
736   param->mysql_ftparam= (void *)&ftb_param;
737   param->cs= ftb->charset;
738   param->doc= (char *) document;
739   param->length= len;
740   param->flags= 0;
741   param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
742   if (unlikely(parser->parse(param)))
743     DBUG_RETURN(-1);
744   DBUG_RETURN(ftb_param.match ? 1 : 0);
745 }
746 
747 
_ftb_climb_the_tree(FTB * ftb,FTB_WORD * ftbw,FT_SEG_ITERATOR * ftsi_orig)748 static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
749 {
750   FT_SEG_ITERATOR ftsi;
751   FTB_EXPR *ftbe;
752   float weight=ftbw->weight;
753   int  yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
754   my_off_t curdoc=ftbw->docid[mode];
755   struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
756                                     &ft_default_parser :
757                                     ftb->info->s->keyinfo[ftb->keynr].parser;
758 
759   for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
760   {
761     ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
762     if (ftbe->docid[mode] != curdoc)
763     {
764       ftbe->cur_weight=0;
765       ftbe->yesses=ftbe->nos=0;
766       ftbe->docid[mode]=curdoc;
767     }
768     if (ftbe->nos)
769       break;
770     if (yn_flag & FTB_FLAG_YES)
771     {
772       weight /= ftbe->ythresh;
773       ftbe->cur_weight += weight;
774       if ((int) ++ftbe->yesses == ythresh)
775       {
776         yn_flag=ftbe->flags;
777         weight=ftbe->cur_weight*ftbe->weight;
778         if (mode && ftbe->phrase)
779         {
780           int found= 0;
781 
782           memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
783           while (_mi_ft_segiterator(&ftsi) && !found)
784           {
785             if (!ftsi.pos)
786               continue;
787             found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
788             if (unlikely(found < 0))
789               return 1;
790           }
791           if (!found)
792             break;
793         } /* ftbe->quot */
794       }
795       else
796         break;
797     }
798     else
799     if (yn_flag & FTB_FLAG_NO)
800     {
801       /*
802         NOTE: special sort function of queue assures that all
803         (yn_flag & FTB_FLAG_NO) != 0
804         events for every particular subexpression will
805         "auto-magically" happen BEFORE all the
806         (yn_flag & FTB_FLAG_YES) != 0 events. So no
807         already matched expression can become not-matched again.
808       */
809       ++ftbe->nos;
810       break;
811     }
812     else
813     {
814       if (ftbe->ythresh)
815         weight/=3;
816       ftbe->cur_weight +=  weight;
817       if ((int) ftbe->yesses < ythresh)
818         break;
819       if (!(yn_flag & FTB_FLAG_WONLY))
820         yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
821       weight*= ftbe->weight;
822     }
823   }
824   return 0;
825 }
826 
827 
ft_boolean_read_next(FT_INFO * ftb,char * record)828 int ft_boolean_read_next(FT_INFO *ftb, char *record)
829 {
830   FTB_EXPR  *ftbe;
831   FTB_WORD  *ftbw;
832   MI_INFO   *info=ftb->info;
833   my_off_t   curdoc;
834 
835   if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
836     return -1;
837 
838   /* black magic ON */
839   if ((int) _mi_check_index(info, ftb->keynr) < 0)
840     return my_errno();
841   if (_mi_readinfo(info, F_RDLCK, 1))
842     return my_errno();
843   /* black magic OFF */
844 
845   if (!ftb->queue.elements)
846   {
847     set_my_errno(HA_ERR_END_OF_FILE);
848     return HA_ERR_END_OF_FILE;
849   }
850 
851   /* Attention!!! Address of a local variable is used here! See err: label */
852   ftb->queue.first_cmp_arg=(void *)&curdoc;
853 
854   while (ftb->state == INDEX_SEARCH &&
855          (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
856          HA_OFFSET_ERROR)
857   {
858     while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
859     {
860       if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
861       {
862         set_my_errno(HA_ERR_OUT_OF_MEM);
863         goto err;
864       }
865 
866       /* update queue */
867       _ft2_search(ftb, ftbw, 0);
868       queue_replaced(& ftb->queue);
869     }
870 
871     ftbe=ftb->root;
872     if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
873         ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
874     {
875       /* curdoc matched ! */
876       if (is_tree_inited(&ftb->no_dupes) &&
877           tree_insert(&ftb->no_dupes, &curdoc, 0,
878                       ftb->no_dupes.custom_arg)->count >1)
879         /* but it managed already to get past this line once */
880         continue;
881 
882       info->lastpos=curdoc;
883       /* Clear all states, except that the table was updated */
884       info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
885 
886       if (!(*info->read_record)(info,curdoc, (uchar*) record))
887       {
888         info->update|= HA_STATE_AKTIV;          /* Record is read */
889         if (ftb->with_scan &&
890             ft_boolean_find_relevance(ftb,(uchar*) record,0)==0)
891             continue; /* no match */
892         set_my_errno(0);
893         goto err;
894       }
895       goto err;
896     }
897   }
898   ftb->state=INDEX_DONE;
899   set_my_errno(HA_ERR_END_OF_FILE);
900 err:
901   ftb->queue.first_cmp_arg=(void *)0;
902   return my_errno();
903 }
904 
905 
906 typedef struct st_my_ftb_find_param
907 {
908   FT_INFO *ftb;
909   FT_SEG_ITERATOR *ftsi;
910 } MY_FTB_FIND_PARAM;
911 
912 
ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))913 static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
914                                        char *word, int len,
915              MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
916 {
917   MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
918   FT_INFO *ftb= ftb_param->ftb;
919   FTB_WORD *ftbw;
920   int a, b, c;
921   /*
922     Find right-most element in the array of query words matching this
923     word from a document.
924   */
925   for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
926   {
927     ftbw= ftb->list[c];
928     if (ha_compare_text(ftb->charset, (uchar*)word, len,
929                         (uchar*)ftbw->word+1, ftbw->len-1,
930                         (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0) < 0)
931       b= c;
932     else
933       a= c;
934   }
935   /*
936     If there were no words with truncation operator, we iterate to the
937     beginning of an array until array element is equal to the word from
938     a document. This is done mainly because the same word may be
939     mentioned twice (or more) in the query.
940 
941     In case query has words with truncation operator we must iterate
942     to the beginning of the array. There may be non-matching query words
943     between matching word with truncation operator and the right-most
944     matching element. E.g., if we're looking for 'aaa15' in an array of
945     'aaa1* aaa14 aaa15 aaa16'.
946 
947     Worse of that there still may be match even if the binary search
948     above didn't find matching element. E.g., if we're looking for
949     'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
950     stop at 'aaa16'.
951   */
952   for (; c >= 0; c--)
953   {
954     ftbw= ftb->list[c];
955     if (ha_compare_text(ftb->charset, (uchar*)word, len,
956                         (uchar*)ftbw->word + 1,ftbw->len - 1,
957                         (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
958     {
959       if (ftb->with_scan & FTB_FLAG_TRUNC)
960         continue;
961       else
962         break;
963     }
964     if (ftbw->docid[1] == ftb->info->lastpos)
965       continue;
966     ftbw->docid[1]= ftb->info->lastpos;
967     if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
968       return 1;
969   }
970   return(0);
971 }
972 
973 
ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM * param,char * doc,int len)974 static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
975                                     char *doc, int len)
976 {
977   MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
978   FT_INFO *ftb= ftb_param->ftb;
979   uchar *end= (uchar*) doc + len;
980   FT_WORD w;
981   while (ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE))
982     param->mysql_add_word(param, (char*) w.pos, w.len, 0);
983   return(0);
984 }
985 
986 
ft_boolean_find_relevance(FT_INFO * ftb,uchar * record,uint length)987 float ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
988 {
989   FTB_EXPR *ftbe;
990   FT_SEG_ITERATOR ftsi, ftsi2;
991   my_off_t  docid=ftb->info->lastpos;
992   MY_FTB_FIND_PARAM ftb_param;
993   MYSQL_FTPARSER_PARAM *param;
994   struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
995                                     &ft_default_parser :
996                                     ftb->info->s->keyinfo[ftb->keynr].parser;
997 
998   if (docid == HA_OFFSET_ERROR)
999     return -2.0;
1000   if (!ftb->queue.elements)
1001     return 0;
1002   if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
1003     return 0;
1004 
1005   if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
1006   {
1007     FTB_EXPR *x;
1008     uint i;
1009 
1010     for (i=0; i < ftb->queue.elements; i++)
1011     {
1012       ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
1013       for (x=ftb->list[i]->up; x; x=x->up)
1014         x->docid[1]=HA_OFFSET_ERROR;
1015     }
1016   }
1017 
1018   ftb->lastpos=docid;
1019 
1020   if (ftb->keynr==NO_SUCH_KEY)
1021     _mi_ft_segiterator_dummy_init(record, length, &ftsi);
1022   else
1023     _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
1024   memcpy(&ftsi2, &ftsi, sizeof(ftsi));
1025 
1026   ftb_param.ftb= ftb;
1027   ftb_param.ftsi= &ftsi2;
1028   param->mysql_parse= ftb_find_relevance_parse;
1029   param->mysql_add_word= ftb_find_relevance_add_word;
1030   param->mysql_ftparam= (void *)&ftb_param;
1031   param->flags= 0;
1032   param->cs= ftb->charset;
1033   param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
1034   while (_mi_ft_segiterator(&ftsi))
1035   {
1036     if (!ftsi.pos)
1037       continue;
1038     param->doc= (char *)ftsi.pos;
1039     param->length= ftsi.len;
1040     if (unlikely(parser->parse(param)))
1041       return 0;
1042   }
1043   ftbe=ftb->root;
1044   if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
1045       ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
1046   { /* row matched ! */
1047     return ftbe->cur_weight;
1048   }
1049   else
1050   { /* match failed ! */
1051     return 0.0;
1052   }
1053 }
1054 
1055 
ft_boolean_close_search(FT_INFO * ftb)1056 void ft_boolean_close_search(FT_INFO *ftb)
1057 {
1058   if (is_tree_inited(& ftb->no_dupes))
1059   {
1060     delete_tree(& ftb->no_dupes);
1061   }
1062   free_root(& ftb->mem_root, MYF(0));
1063   my_free(ftb);
1064 }
1065 
1066 
ft_boolean_get_relevance(FT_INFO * ftb)1067 float ft_boolean_get_relevance(FT_INFO *ftb)
1068 {
1069   return ftb->root->cur_weight;
1070 }
1071 
1072 
ft_boolean_reinit_search(FT_INFO * ftb)1073 void ft_boolean_reinit_search(FT_INFO *ftb)
1074 {
1075   _ftb_init_index_search(ftb);
1076 }
1077 
1078