1 /* Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
24 
25 /*  TODO: add caching - pre-read several index entries at once */
26 
27 /*
28   Added optimization for full-text queries with plus-words. It was
29   implemented by sharing maximal document id (max_docid) variable
30   inside plus subtree. max_docid could be used by any word in plus
31   subtree, but it could be updated by plus-word only.
32 
33   Fulltext "smarter index merge" optimization assumes that rows
34   it gets are ordered by doc_id. That is not the case when we
35   search for a word with truncation operator. It may return
36   rows in random order. Thus we may not use "smarter index merge"
37   optimization with "trunc-words".
38 
39   The idea is: there is no need to search for docid smaller than
40   biggest docid inside current plus subtree or any upper plus subtree.
41 
42   Examples:
43   +word1 word2
44     share same max_docid
45     max_docid updated by word1
46   +word1 +(word2 word3)
47     share same max_docid
48     max_docid updated by word1
49   +(word1 -word2) +(+word3 word4)
50     share same max_docid
51     max_docid updated by word3
52    +word1 word2 (+word3 word4 (+word5 word6))
53     three subexpressions (including the top-level one),
54     every one has its own max_docid, updated by its plus word.
55     but for the search word6 uses
56     max(word1.max_docid, word3.max_docid, word5.max_docid),
57     while word4 uses, accordingly,
58     max(word1.max_docid, word3.max_docid).
59 */
60 
61 #define FT_CORE
62 #include "ftdefs.h"
63 
64 /* search with boolean queries */
65 
66 static double _wghts[11]=
67 {
68   0.131687242798354,
69   0.197530864197531,
70   0.296296296296296,
71   0.444444444444444,
72   0.666666666666667,
73   1.000000000000000,
74   1.500000000000000,
75   2.250000000000000,
76   3.375000000000000,
77   5.062500000000000,
78   7.593750000000000};
79 static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
80 
81 static double _nwghts[11]=
82 {
83  -0.065843621399177,
84  -0.098765432098766,
85  -0.148148148148148,
86  -0.222222222222222,
87  -0.333333333333334,
88  -0.500000000000000,
89  -0.750000000000000,
90  -1.125000000000000,
91  -1.687500000000000,
92  -2.531250000000000,
93  -3.796875000000000};
94 static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
95 
96 #define FTB_FLAG_TRUNC 1
97 /* At most one of the following flags can be set */
98 #define FTB_FLAG_YES   2
99 #define FTB_FLAG_NO    4
100 #define FTB_FLAG_WONLY 8
101 
102 #define CMP_NUM(a,b)    (((a) < (b)) ? -1 : ((a) == (b)) ? 0 : 1)
103 
104 typedef struct st_ftb_expr FTB_EXPR;
105 struct st_ftb_expr
106 {
107   FTB_EXPR *up;
108   uint      flags;
109 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
110   my_off_t  docid[2];
111   my_off_t  max_docid;
112   float     weight;
113   float     cur_weight;
114   LIST     *phrase;               /* phrase words */
115   LIST     *document;             /* for phrase search */
116   uint      yesses;               /* number of "yes" words matched */
117   uint      nos;                  /* number of "no"  words matched */
118   uint      ythresh;              /* number of "yes" words in expr */
119   uint      yweaks;               /* number of "yes" words for scan only */
120 };
121 
122 typedef struct st_ftb_word
123 {
124   FTB_EXPR  *up;
125   uint       flags;
126 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
127   my_off_t   docid[2];             /* for index search and for scan */
128   my_off_t   key_root;
129   FTB_EXPR  *max_docid_expr;
130   MI_KEYDEF *keyinfo;
131   struct st_ftb_word *prev;
132   float      weight;
133   uint       ndepth;
134   uint       len;
135   uchar      off;
136   uchar      word[1];
137 } FTB_WORD;
138 
139 typedef struct st_ft_info
140 {
141   struct _ft_vft *please;
142   MI_INFO   *info;
143   const CHARSET_INFO *charset;
144   FTB_EXPR  *root;
145   FTB_WORD **list;
146   FTB_WORD  *last_word;
147   MEM_ROOT   mem_root;
148   QUEUE      queue;
149   TREE       no_dupes;
150   my_off_t   lastpos;
151   uint       keynr;
152   uchar      with_scan;
153   enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
154 } FTB;
155 
FTB_WORD_cmp(my_off_t * v,FTB_WORD * a,FTB_WORD * b)156 static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
157 {
158   int i;
159 
160   /* if a==curdoc, take it as  a < b */
161   if (v && a->docid[0] == *v)
162     return -1;
163 
164   /* ORDER BY docid, ndepth DESC */
165   i=CMP_NUM(a->docid[0], b->docid[0]);
166   if (!i)
167     i=CMP_NUM(b->ndepth,a->ndepth);
168   return i;
169 }
170 
FTB_WORD_cmp_list(CHARSET_INFO * cs,FTB_WORD ** a,FTB_WORD ** b)171 static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
172 {
173   /* ORDER BY word, ndepth */
174   int i= ha_compare_text(cs, (uchar*) (*a)->word + 1, (*a)->len - 1,
175                              (uchar*) (*b)->word + 1, (*b)->len - 1, 0, 0);
176   if (!i)
177     i= CMP_NUM((*a)->ndepth, (*b)->ndepth);
178   return i;
179 }
180 
181 
182 typedef struct st_my_ftb_param
183 {
184   FTB *ftb;
185   FTB_EXPR *ftbe;
186   uchar *up_quot;
187   uint depth;
188 } MY_FTB_PARAM;
189 
190 
ftb_query_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * info)191 static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
192                               char *word, int word_len,
193                               MYSQL_FTPARSER_BOOLEAN_INFO *info)
194 {
195   MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
196   FTB_WORD *ftbw;
197   FTB_EXPR *ftbe, *tmp_expr;
198   FT_WORD *phrase_word;
199   LIST *tmp_element;
200   int r= info->weight_adjust;
201   float weight= (float)
202         (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
203 
204   switch (info->type) {
205     case FT_TOKEN_WORD:
206       ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
207                                    sizeof(FTB_WORD) +
208                                    (info->trunc ? MI_MAX_KEY_BUFF :
209                                     (word_len + 1) *
210                                     ftb_param->ftb->charset->mbmaxlen +
211                                     HA_FT_WLEN +
212                                     ftb_param->ftb->info->s->rec_reflength));
213       ftbw->len= word_len + 1;
214       ftbw->flags= 0;
215       ftbw->off= 0;
216       if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
217       if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
218       if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
219       ftbw->weight= weight;
220       ftbw->up= ftb_param->ftbe;
221       ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
222       ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
223       ftbw->key_root= HA_OFFSET_ERROR;
224       memcpy(ftbw->word + 1, word, word_len);
225       ftbw->word[0]= word_len;
226       if (info->yesno > 0) ftbw->up->ythresh++;
227       ftb_param->ftb->queue.max_elements++;
228       ftbw->prev= ftb_param->ftb->last_word;
229       ftb_param->ftb->last_word= ftbw;
230       ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
231       for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
232         if (! (tmp_expr->flags & FTB_FLAG_YES))
233           break;
234       ftbw->max_docid_expr= tmp_expr;
235       /* fall through */
236     case FT_TOKEN_STOPWORD:
237       if (! ftb_param->up_quot) break;
238       phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
239       tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
240       phrase_word->pos= (uchar*) word;
241       phrase_word->len= word_len;
242       tmp_element->data= (void *)phrase_word;
243       ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
244       /* Allocate document list at this point.
245          It allows to avoid huge amount of allocs/frees for each row.*/
246       tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
247       tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
248       ftb_param->ftbe->document=
249         list_add(ftb_param->ftbe->document, tmp_element);
250       break;
251     case FT_TOKEN_LEFT_PAREN:
252       ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
253       ftbe->flags= 0;
254       if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
255       if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
256       ftbe->weight= weight;
257       ftbe->up= ftb_param->ftbe;
258       ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
259       ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
260       ftbe->phrase= NULL;
261       ftbe->document= 0;
262       if (info->quot) ftb_param->ftb->with_scan|= 2;
263       if (info->yesno > 0) ftbe->up->ythresh++;
264       ftb_param->ftbe= ftbe;
265       ftb_param->depth++;
266       ftb_param->up_quot= (uchar*) info->quot;
267       break;
268     case FT_TOKEN_RIGHT_PAREN:
269       if (ftb_param->ftbe->document)
270       {
271         /* Circuit document list */
272         for (tmp_element= ftb_param->ftbe->document;
273              tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
274         tmp_element->next= ftb_param->ftbe->document;
275         ftb_param->ftbe->document->prev= tmp_element;
276       }
277       info->quot= 0;
278       if (ftb_param->ftbe->up)
279       {
280         DBUG_ASSERT(ftb_param->depth);
281         ftb_param->ftbe= ftb_param->ftbe->up;
282         ftb_param->depth--;
283         ftb_param->up_quot= 0;
284       }
285       break;
286     case FT_TOKEN_EOF:
287     default:
288       break;
289   }
290   return(0);
291 }
292 
293 
ftb_parse_query_internal(MYSQL_FTPARSER_PARAM * param,char * query,int len)294 static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
295                                     char *query, int len)
296 {
297   MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
298   MYSQL_FTPARSER_BOOLEAN_INFO info;
299   const CHARSET_INFO *cs= ftb_param->ftb->charset;
300   uchar **start= (uchar**) &query;
301   uchar *end= (uchar*) query + len;
302   FT_WORD w;
303 
304   info.prev= ' ';
305   info.quot= 0;
306   while (ft_get_word(cs, start, end, &w, &info))
307     param->mysql_add_word(param, (char*) w.pos, w.len, &info);
308   return(0);
309 }
310 
311 
_ftb_parse_query(FTB * ftb,uchar * query,uint len,struct st_mysql_ftparser * parser)312 static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
313                             struct st_mysql_ftparser *parser)
314 {
315   MYSQL_FTPARSER_PARAM *param;
316   MY_FTB_PARAM ftb_param;
317   DBUG_ENTER("_ftb_parse_query");
318   DBUG_ASSERT(parser);
319 
320   if (ftb->state != UNINITIALIZED)
321     DBUG_RETURN(0);
322   if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
323     DBUG_RETURN(1);
324 
325   ftb_param.ftb= ftb;
326   ftb_param.depth= 0;
327   ftb_param.ftbe= ftb->root;
328   ftb_param.up_quot= 0;
329 
330   param->mysql_parse= ftb_parse_query_internal;
331   param->mysql_add_word= ftb_query_add_word;
332   param->mysql_ftparam= (void *)&ftb_param;
333   param->cs= ftb->charset;
334   param->doc= (char*) query;
335   param->length= len;
336   param->flags= 0;
337   param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
338   DBUG_RETURN(parser->parse(param));
339 }
340 
341 
_ftb_no_dupes_cmp(const void * not_used MY_ATTRIBUTE ((unused)),const void * a,const void * b)342 static int _ftb_no_dupes_cmp(const void* not_used MY_ATTRIBUTE((unused)),
343                              const void *a,const void *b)
344 {
345   return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
346 }
347 
348 /*
349   When performing prefix search (a word with truncation operator), we
350   must preserve original prefix to ensure that characters which may be
351   expanded/contracted do not break the prefix. This is done by storing
352   newly found key immediatly after the original word in ftbw->word
353   buffer.
354 
355   ftbw->word= LENGTH WORD [ LENGTH1 WORD1 ] WEIGHT REFERENCE
356   LENGTH - 1 byte, length of the WORD
357   WORD - LENGTH bytes, the word itself
358   LENGTH1 - 1 byte, length of the WORD1, present in case of prefix search
359   WORD1 - LENGTH bytes, the word itself, present in case of prefix search
360   WEIGHT - 4 bytes (HA_FT_WLEN), either weight or number of subkeys
361   REFERENCE - rec_reflength bytes, pointer to the record
362 
363   returns 1 if the search was finished (must-word wasn't found)
364 */
_ft2_search_no_lock(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)365 static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
366 {
367   int r;
368   int subkeys=1;
369   my_bool can_go_down;
370   MI_INFO *info=ftb->info;
371   uint UNINIT_VAR(off), extra= HA_FT_WLEN + info->s->rec_reflength;
372   uchar *lastkey_buf=ftbw->word+ftbw->off;
373   uint max_word_length= (ftbw->flags & FTB_FLAG_TRUNC) ? MI_MAX_KEY_BUFF :
374                         ((ftbw->len) * ftb->charset->mbmaxlen) + extra;
375 
376   if (ftbw->flags & FTB_FLAG_TRUNC)
377     lastkey_buf+=ftbw->len;
378 
379   if (init_search)
380   {
381     ftbw->key_root=info->s->state.key_root[ftb->keynr];
382     ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
383 
384     r=_mi_search(info, ftbw->keyinfo, (uchar*) ftbw->word, ftbw->len,
385                  SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
386   }
387   else
388   {
389     uint sflag= SEARCH_BIGGER;
390     my_off_t max_docid=0;
391     FTB_EXPR *tmp;
392 
393     for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
394       set_if_bigger(max_docid, tmp->max_docid);
395 
396     if (ftbw->docid[0] < max_docid)
397     {
398       sflag|= SEARCH_SAME;
399       _mi_dpointer(info, (uchar*) (lastkey_buf + HA_FT_WLEN +
400                                    (ftbw->off ? 0 : lastkey_buf[0] + 1)),
401                    max_docid);
402     }
403     r=_mi_search(info, ftbw->keyinfo, (uchar*) lastkey_buf,
404                    USE_WHOLE_KEY, sflag, ftbw->key_root);
405   }
406 
407   can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
408   /* Skip rows inserted by concurrent insert */
409   while (!r)
410   {
411     if (can_go_down)
412     {
413       /* going down ? */
414       off=info->lastkey_length-extra;
415       subkeys=ft_sintXkorr(info->lastkey+off);
416     }
417     if (subkeys<0 || info->lastpos < info->state->data_file_length)
418       break;
419     r= _mi_search_next(info, ftbw->keyinfo, info->lastkey,
420                        info->lastkey_length,
421 		       SEARCH_BIGGER, ftbw->key_root);
422   }
423 
424   if (!r && !ftbw->off)
425   {
426     r= ha_compare_text(ftb->charset,
427                        info->lastkey+1,
428                        info->lastkey_length-extra-1,
429               (uchar*) ftbw->word+1,
430                        ftbw->len-1,
431              (my_bool) (ftbw->flags & FTB_FLAG_TRUNC),0);
432   }
433 
434   if (r || info->lastkey_length > max_word_length) /* not found */
435   {
436     if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
437     {
438       ftbw->docid[0]=HA_OFFSET_ERROR;
439       if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
440       {
441         /*
442           This word MUST BE present in every document returned,
443           so we can stop the search right now
444         */
445         ftb->state=INDEX_DONE;
446         return 1; /* search is done */
447       }
448       else
449         return 0;
450     }
451 
452     /*
453       Going up to the first-level tree to continue search there.
454       Only done when performing prefix search.
455 
456       Key buffer data pointer as well as docid[0] may be smaller
457       than values we got while searching first-level tree. Thus
458       they must be restored to original values to avoid dead-loop,
459       when subsequent search for a bigger value eventually ends up
460       in this same second-level tree.
461     */
462     _mi_dpointer(info, (uchar*) (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
463     ftbw->docid[0]= ftbw->key_root;
464     ftbw->key_root=info->s->state.key_root[ftb->keynr];
465     ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
466     ftbw->off=0;
467     return _ft2_search_no_lock(ftb, ftbw, 0);
468   }
469 
470   /* matching key found */
471   memcpy(lastkey_buf, info->lastkey, info->lastkey_length);
472   if (lastkey_buf == ftbw->word)
473     ftbw->len=info->lastkey_length-extra;
474 
475   /* going down ? */
476   if (subkeys<0)
477   {
478     /*
479       yep, going down, to the second-level tree
480       TODO here: subkey-based optimization
481     */
482     ftbw->off=off;
483     ftbw->key_root=info->lastpos;
484     ftbw->keyinfo=& info->s->ft2_keyinfo;
485     r=_mi_search_first(info, ftbw->keyinfo, ftbw->key_root);
486     DBUG_ASSERT(r==0);  /* found something */
487     memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length);
488   }
489   ftbw->docid[0]=info->lastpos;
490   if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
491     ftbw->max_docid_expr->max_docid= info->lastpos;
492   return 0;
493 }
494 
_ft2_search(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)495 static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
496 {
497   int r;
498   MYISAM_SHARE *share= ftb->info->s;
499   if (share->concurrent_insert)
500     mysql_rwlock_rdlock(&share->key_root_lock[ftb->keynr]);
501   r= _ft2_search_no_lock(ftb, ftbw, init_search);
502   if (share->concurrent_insert)
503     mysql_rwlock_unlock(&share->key_root_lock[ftb->keynr]);
504   return r;
505 }
506 
_ftb_init_index_search(FT_INFO * ftb)507 static void _ftb_init_index_search(FT_INFO *ftb)
508 {
509   int i;
510   FTB_WORD   *ftbw;
511 
512   if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
513     return;
514   ftb->state=INDEX_SEARCH;
515 
516   for (i=ftb->queue.elements; i; i--)
517   {
518     ftbw=(FTB_WORD *)(ftb->queue.root[i]);
519 
520     if (ftbw->flags & FTB_FLAG_TRUNC)
521     {
522       /*
523         special treatment for truncation operator
524         1. there are some (besides this) +words
525            | no need to search in the index, it can never ADD new rows
526            | to the result, and to remove half-matched rows we do scan anyway
527         2. -trunc*
528            | same as 1.
529         3. in 1 and 2, +/- need not be on the same expr. level,
530            but can be on any upper level, as in +word +(trunc1* trunc2*)
531         4. otherwise
532            | We have to index-search for this prefix.
533            | It may cause duplicates, as in the index (sorted by <word,docid>)
534            |   <aaaa,row1>
535            |   <aabb,row2>
536            |   <aacc,row1>
537            | Searching for "aa*" will find row1 twice...
538       */
539       FTB_EXPR *ftbe;
540       for (ftbe=(FTB_EXPR*)ftbw;
541            ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
542            ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
543       {
544         if (ftbe->flags & FTB_FLAG_NO ||                     /* 2 */
545             ftbe->up->ythresh - ftbe->up->yweaks >
546             (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES))      /* 1 */
547         {
548           FTB_EXPR *top_ftbe=ftbe->up;
549           ftbw->docid[0]=HA_OFFSET_ERROR;
550           for (ftbe=(FTB_EXPR *)ftbw;
551                ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
552                ftbe=ftbe->up)
553               ftbe->up->yweaks++;
554           ftbe=0;
555           break;
556         }
557       }
558       if (!ftbe)
559         continue;
560       /* 4 */
561       if (!is_tree_inited(& ftb->no_dupes))
562         init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
563             _ftb_no_dupes_cmp,0,0,0);
564       else
565         reset_tree(& ftb->no_dupes);
566     }
567 
568     ftbw->off=0; /* in case of reinit */
569     if (_ft2_search(ftb, ftbw, 1))
570       return;
571   }
572   queue_fix(& ftb->queue);
573 }
574 
575 
ft_init_boolean_search(MI_INFO * info,uint keynr,uchar * query,uint query_len,const CHARSET_INFO * cs)576 FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
577                                  uint query_len, const CHARSET_INFO *cs)
578 {
579   FTB       *ftb;
580   FTB_EXPR  *ftbe;
581   FTB_WORD  *ftbw;
582 
583   if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
584     return 0;
585   ftb->please= (struct _ft_vft *) & _ft_vft_boolean;
586   ftb->state=UNINITIALIZED;
587   ftb->info=info;
588   ftb->keynr=keynr;
589   ftb->charset=cs;
590   DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
591   ftb->with_scan=0;
592   ftb->lastpos=HA_OFFSET_ERROR;
593   memset(&ftb->no_dupes, 0, sizeof(TREE));
594   ftb->last_word= 0;
595 
596   init_alloc_root(&ftb->mem_root, 1024, 1024);
597   ftb->queue.max_elements= 0;
598   if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
599     goto err;
600   ftbe->weight=1;
601   ftbe->flags=FTB_FLAG_YES;
602   ftbe->nos=1;
603   ftbe->up=0;
604   ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
605   ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
606   ftbe->phrase= NULL;
607   ftbe->document= 0;
608   ftb->root=ftbe;
609   if (unlikely(_ftb_parse_query(ftb, query, query_len,
610                                 keynr == NO_SUCH_KEY ? &ft_default_parser :
611                                 info->s->keyinfo[keynr].parser)))
612     goto err;
613   /*
614     Hack: instead of init_queue, we'll use reinit queue to be able
615     to alloc queue with alloc_root()
616   */
617   if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
618                                               (ftb->queue.max_elements + 1) *
619                                               sizeof(void *))))
620     goto err;
621   reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
622                          (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
623   for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
624     queue_insert(&ftb->queue, (uchar *)ftbw);
625   ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
626                                      sizeof(FTB_WORD *)*ftb->queue.elements);
627   memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
628   my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
629             (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
630   if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
631   ftb->state=READY;
632   return ftb;
633 err:
634   free_root(& ftb->mem_root, MYF(0));
635   my_free(ftb);
636   return 0;
637 }
638 
639 
640 typedef struct st_my_ftb_phrase_param
641 {
642   LIST *phrase;
643   LIST *document;
644   const CHARSET_INFO *cs;
645   uint phrase_length;
646   uint document_length;
647   uint match;
648 } MY_FTB_PHRASE_PARAM;
649 
650 
ftb_phrase_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))651 static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
652                                char *word, int word_len,
653     MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
654 {
655   MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
656   FT_WORD *w= (FT_WORD *)phrase_param->document->data;
657   LIST *phrase, *document;
658   w->pos= (uchar*) word;
659   w->len= word_len;
660   phrase_param->document= phrase_param->document->prev;
661   if (phrase_param->phrase_length > phrase_param->document_length)
662   {
663     phrase_param->document_length++;
664     return 0;
665   }
666   /* TODO: rewrite phrase search to avoid
667      comparing the same word twice. */
668   for (phrase= phrase_param->phrase, document= phrase_param->document->next;
669        phrase; phrase= phrase->next, document= document->next)
670   {
671     FT_WORD *phrase_word= (FT_WORD *)phrase->data;
672     FT_WORD *document_word= (FT_WORD *)document->data;
673     if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
674                      phrase_word->len,
675                      (uchar*) document_word->pos, document_word->len))
676       return 0;
677   }
678   phrase_param->match++;
679   return 0;
680 }
681 
682 
ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM * param,char * document,int len)683 static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
684                                      char *document, int len)
685 {
686   FT_WORD word;
687   MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
688   const uchar *docend= (uchar*) document + len;
689   while (ft_simple_get_word(phrase_param->cs, (uchar**) &document, docend,
690                             &word, FALSE))
691   {
692     param->mysql_add_word(param, (char*) word.pos, word.len, 0);
693     if (phrase_param->match)
694       break;
695   }
696   return 0;
697 }
698 
699 
700 /*
701   Checks if given buffer matches phrase list.
702 
703   SYNOPSIS
704     _ftb_check_phrase()
705     s0     start of buffer
706     e0     end of buffer
707     phrase broken into list phrase
708     cs     charset info
709 
710   RETURN VALUE
711     1 is returned if phrase found, 0 else.
712     -1 is returned if error occurs.
713 */
714 
_ftb_check_phrase(FTB * ftb,const uchar * document,uint len,FTB_EXPR * ftbe,struct st_mysql_ftparser * parser)715 static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
716                 FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
717 {
718   MY_FTB_PHRASE_PARAM ftb_param;
719   MYSQL_FTPARSER_PARAM *param;
720   DBUG_ENTER("_ftb_check_phrase");
721   DBUG_ASSERT(parser);
722 
723   if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
724     DBUG_RETURN(0);
725 
726   ftb_param.phrase= ftbe->phrase;
727   ftb_param.document= ftbe->document;
728   ftb_param.cs= ftb->charset;
729   ftb_param.phrase_length= list_length(ftbe->phrase);
730   ftb_param.document_length= 1;
731   ftb_param.match= 0;
732 
733   param->mysql_parse= ftb_check_phrase_internal;
734   param->mysql_add_word= ftb_phrase_add_word;
735   param->mysql_ftparam= (void *)&ftb_param;
736   param->cs= ftb->charset;
737   param->doc= (char *) document;
738   param->length= len;
739   param->flags= 0;
740   param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
741   if (unlikely(parser->parse(param)))
742     return -1;
743   DBUG_RETURN(ftb_param.match ? 1 : 0);
744 }
745 
746 
_ftb_climb_the_tree(FTB * ftb,FTB_WORD * ftbw,FT_SEG_ITERATOR * ftsi_orig)747 static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
748 {
749   FT_SEG_ITERATOR ftsi;
750   FTB_EXPR *ftbe;
751   float weight=ftbw->weight;
752   int  yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
753   my_off_t curdoc=ftbw->docid[mode];
754   struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
755                                     &ft_default_parser :
756                                     ftb->info->s->keyinfo[ftb->keynr].parser;
757 
758   for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
759   {
760     ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
761     if (ftbe->docid[mode] != curdoc)
762     {
763       ftbe->cur_weight=0;
764       ftbe->yesses=ftbe->nos=0;
765       ftbe->docid[mode]=curdoc;
766     }
767     if (ftbe->nos)
768       break;
769     if (yn_flag & FTB_FLAG_YES)
770     {
771       weight /= ftbe->ythresh;
772       ftbe->cur_weight += weight;
773       if ((int) ++ftbe->yesses == ythresh)
774       {
775         yn_flag=ftbe->flags;
776         weight=ftbe->cur_weight*ftbe->weight;
777         if (mode && ftbe->phrase)
778         {
779           int found= 0;
780 
781           memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
782           while (_mi_ft_segiterator(&ftsi) && !found)
783           {
784             if (!ftsi.pos)
785               continue;
786             found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
787             if (unlikely(found < 0))
788               return 1;
789           }
790           if (!found)
791             break;
792         } /* ftbe->quot */
793       }
794       else
795         break;
796     }
797     else
798     if (yn_flag & FTB_FLAG_NO)
799     {
800       /*
801         NOTE: special sort function of queue assures that all
802         (yn_flag & FTB_FLAG_NO) != 0
803         events for every particular subexpression will
804         "auto-magically" happen BEFORE all the
805         (yn_flag & FTB_FLAG_YES) != 0 events. So no
806         already matched expression can become not-matched again.
807       */
808       ++ftbe->nos;
809       break;
810     }
811     else
812     {
813       if (ftbe->ythresh)
814         weight/=3;
815       ftbe->cur_weight +=  weight;
816       if ((int) ftbe->yesses < ythresh)
817         break;
818       if (!(yn_flag & FTB_FLAG_WONLY))
819         yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
820       weight*= ftbe->weight;
821     }
822   }
823   return 0;
824 }
825 
826 
ft_boolean_read_next(FT_INFO * ftb,char * record)827 int ft_boolean_read_next(FT_INFO *ftb, char *record)
828 {
829   FTB_EXPR  *ftbe;
830   FTB_WORD  *ftbw;
831   MI_INFO   *info=ftb->info;
832   my_off_t   curdoc;
833 
834   if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
835     return -1;
836 
837   /* black magic ON */
838   if ((int) _mi_check_index(info, ftb->keynr) < 0)
839     return my_errno;
840   if (_mi_readinfo(info, F_RDLCK, 1))
841     return my_errno;
842   /* black magic OFF */
843 
844   if (!ftb->queue.elements)
845     return my_errno=HA_ERR_END_OF_FILE;
846 
847   /* Attention!!! Address of a local variable is used here! See err: label */
848   ftb->queue.first_cmp_arg=(void *)&curdoc;
849 
850   while (ftb->state == INDEX_SEARCH &&
851          (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
852          HA_OFFSET_ERROR)
853   {
854     while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
855     {
856       if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
857       {
858         my_errno= HA_ERR_OUT_OF_MEM;
859         goto err;
860       }
861 
862       /* update queue */
863       _ft2_search(ftb, ftbw, 0);
864       queue_replaced(& ftb->queue);
865     }
866 
867     ftbe=ftb->root;
868     if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
869         ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
870     {
871       /* curdoc matched ! */
872       if (is_tree_inited(&ftb->no_dupes) &&
873           tree_insert(&ftb->no_dupes, &curdoc, 0,
874                       ftb->no_dupes.custom_arg)->count >1)
875         /* but it managed already to get past this line once */
876         continue;
877 
878       info->lastpos=curdoc;
879       /* Clear all states, except that the table was updated */
880       info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
881 
882       if (!(*info->read_record)(info,curdoc, (uchar*) record))
883       {
884         info->update|= HA_STATE_AKTIV;          /* Record is read */
885         if (ftb->with_scan &&
886             ft_boolean_find_relevance(ftb,(uchar*) record,0)==0)
887             continue; /* no match */
888         my_errno=0;
889         goto err;
890       }
891       goto err;
892     }
893   }
894   ftb->state=INDEX_DONE;
895   my_errno=HA_ERR_END_OF_FILE;
896 err:
897   ftb->queue.first_cmp_arg=(void *)0;
898   return my_errno;
899 }
900 
901 
902 typedef struct st_my_ftb_find_param
903 {
904   FT_INFO *ftb;
905   FT_SEG_ITERATOR *ftsi;
906 } MY_FTB_FIND_PARAM;
907 
908 
ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))909 static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
910                                        char *word, int len,
911              MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
912 {
913   MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
914   FT_INFO *ftb= ftb_param->ftb;
915   FTB_WORD *ftbw;
916   int a, b, c;
917   /*
918     Find right-most element in the array of query words matching this
919     word from a document.
920   */
921   for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
922   {
923     ftbw= ftb->list[c];
924     if (ha_compare_text(ftb->charset, (uchar*)word, len,
925                         (uchar*)ftbw->word+1, ftbw->len-1,
926                         (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0) < 0)
927       b= c;
928     else
929       a= c;
930   }
931   /*
932     If there were no words with truncation operator, we iterate to the
933     beginning of an array until array element is equal to the word from
934     a document. This is done mainly because the same word may be
935     mentioned twice (or more) in the query.
936 
937     In case query has words with truncation operator we must iterate
938     to the beginning of the array. There may be non-matching query words
939     between matching word with truncation operator and the right-most
940     matching element. E.g., if we're looking for 'aaa15' in an array of
941     'aaa1* aaa14 aaa15 aaa16'.
942 
943     Worse of that there still may be match even if the binary search
944     above didn't find matching element. E.g., if we're looking for
945     'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
946     stop at 'aaa16'.
947   */
948   for (; c >= 0; c--)
949   {
950     ftbw= ftb->list[c];
951     if (ha_compare_text(ftb->charset, (uchar*)word, len,
952                         (uchar*)ftbw->word + 1,ftbw->len - 1,
953                         (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
954     {
955       if (ftb->with_scan & FTB_FLAG_TRUNC)
956         continue;
957       else
958         break;
959     }
960     if (ftbw->docid[1] == ftb->info->lastpos)
961       continue;
962     ftbw->docid[1]= ftb->info->lastpos;
963     if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
964       return 1;
965   }
966   return(0);
967 }
968 
969 
ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM * param,char * doc,int len)970 static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
971                                     char *doc, int len)
972 {
973   MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
974   FT_INFO *ftb= ftb_param->ftb;
975   uchar *end= (uchar*) doc + len;
976   FT_WORD w;
977   while (ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE))
978     param->mysql_add_word(param, (char*) w.pos, w.len, 0);
979   return(0);
980 }
981 
982 
ft_boolean_find_relevance(FT_INFO * ftb,uchar * record,uint length)983 float ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
984 {
985   FTB_EXPR *ftbe;
986   FT_SEG_ITERATOR ftsi, ftsi2;
987   my_off_t  docid=ftb->info->lastpos;
988   MY_FTB_FIND_PARAM ftb_param;
989   MYSQL_FTPARSER_PARAM *param;
990   struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
991                                     &ft_default_parser :
992                                     ftb->info->s->keyinfo[ftb->keynr].parser;
993 
994   if (docid == HA_OFFSET_ERROR)
995     return -2.0;
996   if (!ftb->queue.elements)
997     return 0;
998   if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
999     return 0;
1000 
1001   if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
1002   {
1003     FTB_EXPR *x;
1004     uint i;
1005 
1006     for (i=0; i < ftb->queue.elements; i++)
1007     {
1008       ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
1009       for (x=ftb->list[i]->up; x; x=x->up)
1010         x->docid[1]=HA_OFFSET_ERROR;
1011     }
1012   }
1013 
1014   ftb->lastpos=docid;
1015 
1016   if (ftb->keynr==NO_SUCH_KEY)
1017     _mi_ft_segiterator_dummy_init(record, length, &ftsi);
1018   else
1019     _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
1020   memcpy(&ftsi2, &ftsi, sizeof(ftsi));
1021 
1022   ftb_param.ftb= ftb;
1023   ftb_param.ftsi= &ftsi2;
1024   param->mysql_parse= ftb_find_relevance_parse;
1025   param->mysql_add_word= ftb_find_relevance_add_word;
1026   param->mysql_ftparam= (void *)&ftb_param;
1027   param->flags= 0;
1028   param->cs= ftb->charset;
1029   param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
1030   while (_mi_ft_segiterator(&ftsi))
1031   {
1032     if (!ftsi.pos)
1033       continue;
1034     param->doc= (char *)ftsi.pos;
1035     param->length= ftsi.len;
1036     if (unlikely(parser->parse(param)))
1037       return 0;
1038   }
1039   ftbe=ftb->root;
1040   if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
1041       ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
1042   { /* row matched ! */
1043     return ftbe->cur_weight;
1044   }
1045   else
1046   { /* match failed ! */
1047     return 0.0;
1048   }
1049 }
1050 
1051 
ft_boolean_close_search(FT_INFO * ftb)1052 void ft_boolean_close_search(FT_INFO *ftb)
1053 {
1054   if (is_tree_inited(& ftb->no_dupes))
1055   {
1056     delete_tree(& ftb->no_dupes);
1057   }
1058   free_root(& ftb->mem_root, MYF(0));
1059   my_free(ftb);
1060 }
1061 
1062 
ft_boolean_get_relevance(FT_INFO * ftb)1063 float ft_boolean_get_relevance(FT_INFO *ftb)
1064 {
1065   return ftb->root->cur_weight;
1066 }
1067 
1068 
ft_boolean_reinit_search(FT_INFO * ftb)1069 void ft_boolean_reinit_search(FT_INFO *ftb)
1070 {
1071   _ftb_init_index_search(ftb);
1072 }
1073 
1074