1 /* Copyright (c) 2001, 2021, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
24
25 /* TODO: add caching - pre-read several index entries at once */
26
27 /*
28 Added optimization for full-text queries with plus-words. It was
29 implemented by sharing maximal document id (max_docid) variable
30 inside plus subtree. max_docid could be used by any word in plus
31 subtree, but it could be updated by plus-word only.
32
33 Fulltext "smarter index merge" optimization assumes that rows
34 it gets are ordered by doc_id. That is not the case when we
35 search for a word with truncation operator. It may return
36 rows in random order. Thus we may not use "smarter index merge"
37 optimization with "trunc-words".
38
39 The idea is: there is no need to search for docid smaller than
40 biggest docid inside current plus subtree or any upper plus subtree.
41
42 Examples:
43 +word1 word2
44 share same max_docid
45 max_docid updated by word1
46 +word1 +(word2 word3)
47 share same max_docid
48 max_docid updated by word1
49 +(word1 -word2) +(+word3 word4)
50 share same max_docid
51 max_docid updated by word3
52 +word1 word2 (+word3 word4 (+word5 word6))
53 three subexpressions (including the top-level one),
54 every one has its own max_docid, updated by its plus word.
55 but for the search word6 uses
56 max(word1.max_docid, word3.max_docid, word5.max_docid),
57 while word4 uses, accordingly,
58 max(word1.max_docid, word3.max_docid).
59 */
60
61 #define FT_CORE
62 #include "ftdefs.h"
63
64 /* search with boolean queries */
65
66 static double _wghts[11]=
67 {
68 0.131687242798354,
69 0.197530864197531,
70 0.296296296296296,
71 0.444444444444444,
72 0.666666666666667,
73 1.000000000000000,
74 1.500000000000000,
75 2.250000000000000,
76 3.375000000000000,
77 5.062500000000000,
78 7.593750000000000};
79 static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
80
81 static double _nwghts[11]=
82 {
83 -0.065843621399177,
84 -0.098765432098766,
85 -0.148148148148148,
86 -0.222222222222222,
87 -0.333333333333334,
88 -0.500000000000000,
89 -0.750000000000000,
90 -1.125000000000000,
91 -1.687500000000000,
92 -2.531250000000000,
93 -3.796875000000000};
94 static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
95
96 #define FTB_FLAG_TRUNC 1
97 /* At most one of the following flags can be set */
98 #define FTB_FLAG_YES 2
99 #define FTB_FLAG_NO 4
100 #define FTB_FLAG_WONLY 8
101
102 #define CMP_NUM(a,b) (((a) < (b)) ? -1 : ((a) == (b)) ? 0 : 1)
103
104 typedef struct st_ftb_expr FTB_EXPR;
105 struct st_ftb_expr
106 {
107 FTB_EXPR *up;
108 uint flags;
109 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
110 my_off_t docid[2];
111 my_off_t max_docid;
112 float weight;
113 float cur_weight;
114 LIST *phrase; /* phrase words */
115 LIST *document; /* for phrase search */
116 uint yesses; /* number of "yes" words matched */
117 uint nos; /* number of "no" words matched */
118 uint ythresh; /* number of "yes" words in expr */
119 uint yweaks; /* number of "yes" words for scan only */
120 };
121
122 typedef struct st_ftb_word
123 {
124 FTB_EXPR *up;
125 uint flags;
126 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
127 my_off_t docid[2]; /* for index search and for scan */
128 my_off_t key_root;
129 FTB_EXPR *max_docid_expr;
130 MI_KEYDEF *keyinfo;
131 struct st_ftb_word *prev;
132 float weight;
133 uint ndepth;
134 uint len;
135 uchar off;
136 uchar word[1];
137 } FTB_WORD;
138
139 typedef struct st_ft_info
140 {
141 struct _ft_vft *please;
142 MI_INFO *info;
143 const CHARSET_INFO *charset;
144 FTB_EXPR *root;
145 FTB_WORD **list;
146 FTB_WORD *last_word;
147 MEM_ROOT mem_root;
148 QUEUE queue;
149 TREE no_dupes;
150 my_off_t lastpos;
151 uint keynr;
152 uchar with_scan;
153 enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
154 } FTB;
155
FTB_WORD_cmp(my_off_t * v,FTB_WORD * a,FTB_WORD * b)156 static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
157 {
158 int i;
159
160 /* if a==curdoc, take it as a < b */
161 if (v && a->docid[0] == *v)
162 return -1;
163
164 /* ORDER BY docid, ndepth DESC */
165 i=CMP_NUM(a->docid[0], b->docid[0]);
166 if (!i)
167 i=CMP_NUM(b->ndepth,a->ndepth);
168 return i;
169 }
170
FTB_WORD_cmp_list(CHARSET_INFO * cs,FTB_WORD ** a,FTB_WORD ** b)171 static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
172 {
173 /* ORDER BY word, ndepth */
174 int i= ha_compare_text(cs, (uchar*) (*a)->word + 1, (*a)->len - 1,
175 (uchar*) (*b)->word + 1, (*b)->len - 1, 0, 0);
176 if (!i)
177 i= CMP_NUM((*a)->ndepth, (*b)->ndepth);
178 return i;
179 }
180
181
182 typedef struct st_my_ftb_param
183 {
184 FTB *ftb;
185 FTB_EXPR *ftbe;
186 uchar *up_quot;
187 uint depth;
188 } MY_FTB_PARAM;
189
190
ftb_query_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * info)191 static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
192 char *word, int word_len,
193 MYSQL_FTPARSER_BOOLEAN_INFO *info)
194 {
195 MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
196 FTB_WORD *ftbw;
197 FTB_EXPR *ftbe, *tmp_expr;
198 FT_WORD *phrase_word;
199 LIST *tmp_element;
200 int r= info->weight_adjust;
201 float weight= (float)
202 (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
203
204 switch (info->type) {
205 case FT_TOKEN_WORD:
206 ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
207 sizeof(FTB_WORD) +
208 (info->trunc ? MI_MAX_KEY_BUFF :
209 (word_len + 1) *
210 ftb_param->ftb->charset->mbmaxlen +
211 HA_FT_WLEN +
212 ftb_param->ftb->info->s->rec_reflength));
213 ftbw->len= word_len + 1;
214 ftbw->flags= 0;
215 ftbw->off= 0;
216 if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
217 if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
218 if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
219 ftbw->weight= weight;
220 ftbw->up= ftb_param->ftbe;
221 ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
222 ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
223 ftbw->key_root= HA_OFFSET_ERROR;
224 memcpy(ftbw->word + 1, word, word_len);
225 ftbw->word[0]= word_len;
226 if (info->yesno > 0) ftbw->up->ythresh++;
227 ftb_param->ftb->queue.max_elements++;
228 ftbw->prev= ftb_param->ftb->last_word;
229 ftb_param->ftb->last_word= ftbw;
230 ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
231 for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
232 if (! (tmp_expr->flags & FTB_FLAG_YES))
233 break;
234 ftbw->max_docid_expr= tmp_expr;
235 /* fall through */
236 case FT_TOKEN_STOPWORD:
237 if (! ftb_param->up_quot) break;
238 phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
239 tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
240 phrase_word->pos= (uchar*) word;
241 phrase_word->len= word_len;
242 tmp_element->data= (void *)phrase_word;
243 ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
244 /* Allocate document list at this point.
245 It allows to avoid huge amount of allocs/frees for each row.*/
246 tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
247 tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
248 ftb_param->ftbe->document=
249 list_add(ftb_param->ftbe->document, tmp_element);
250 break;
251 case FT_TOKEN_LEFT_PAREN:
252 ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
253 ftbe->flags= 0;
254 if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
255 if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
256 ftbe->weight= weight;
257 ftbe->up= ftb_param->ftbe;
258 ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
259 ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
260 ftbe->phrase= NULL;
261 ftbe->document= 0;
262 if (info->quot) ftb_param->ftb->with_scan|= 2;
263 if (info->yesno > 0) ftbe->up->ythresh++;
264 ftb_param->ftbe= ftbe;
265 ftb_param->depth++;
266 ftb_param->up_quot= (uchar*) info->quot;
267 break;
268 case FT_TOKEN_RIGHT_PAREN:
269 if (ftb_param->ftbe->document)
270 {
271 /* Circuit document list */
272 for (tmp_element= ftb_param->ftbe->document;
273 tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
274 tmp_element->next= ftb_param->ftbe->document;
275 ftb_param->ftbe->document->prev= tmp_element;
276 }
277 info->quot= 0;
278 if (ftb_param->ftbe->up)
279 {
280 assert(ftb_param->depth);
281 ftb_param->ftbe= ftb_param->ftbe->up;
282 ftb_param->depth--;
283 ftb_param->up_quot= 0;
284 }
285 break;
286 case FT_TOKEN_EOF:
287 default:
288 break;
289 }
290 return(0);
291 }
292
293
ftb_parse_query_internal(MYSQL_FTPARSER_PARAM * param,char * query,int len)294 static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
295 char *query, int len)
296 {
297 MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
298 MYSQL_FTPARSER_BOOLEAN_INFO info;
299 const CHARSET_INFO *cs= ftb_param->ftb->charset;
300 uchar **start= (uchar**) &query;
301 uchar *end= (uchar*) query + len;
302 FT_WORD w;
303
304 info.prev= ' ';
305 info.quot= 0;
306 while (ft_get_word(cs, start, end, &w, &info))
307 param->mysql_add_word(param, (char*) w.pos, w.len, &info);
308 return(0);
309 }
310
311
_ftb_parse_query(FTB * ftb,uchar * query,uint len,struct st_mysql_ftparser * parser)312 static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
313 struct st_mysql_ftparser *parser)
314 {
315 MYSQL_FTPARSER_PARAM *param;
316 MY_FTB_PARAM ftb_param;
317 DBUG_ENTER("_ftb_parse_query");
318 assert(parser);
319
320 if (ftb->state != UNINITIALIZED)
321 DBUG_RETURN(0);
322 if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
323 DBUG_RETURN(1);
324
325 ftb_param.ftb= ftb;
326 ftb_param.depth= 0;
327 ftb_param.ftbe= ftb->root;
328 ftb_param.up_quot= 0;
329
330 param->mysql_parse= ftb_parse_query_internal;
331 param->mysql_add_word= ftb_query_add_word;
332 param->mysql_ftparam= (void *)&ftb_param;
333 param->cs= ftb->charset;
334 param->doc= (char*) query;
335 param->length= len;
336 param->flags= 0;
337 param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
338 DBUG_RETURN(parser->parse(param));
339 }
340
341
_ftb_no_dupes_cmp(const void * not_used MY_ATTRIBUTE ((unused)),const void * a,const void * b)342 static int _ftb_no_dupes_cmp(const void* not_used MY_ATTRIBUTE((unused)),
343 const void *a,const void *b)
344 {
345 return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
346 }
347
348 /*
349 When performing prefix search (a word with truncation operator), we
350 must preserve original prefix to ensure that characters which may be
351 expanded/contracted do not break the prefix. This is done by storing
352 newly found key immediatly after the original word in ftbw->word
353 buffer.
354
355 ftbw->word= LENGTH WORD [ LENGTH1 WORD1 ] WEIGHT REFERENCE
356 LENGTH - 1 byte, length of the WORD
357 WORD - LENGTH bytes, the word itself
358 LENGTH1 - 1 byte, length of the WORD1, present in case of prefix search
359 WORD1 - LENGTH bytes, the word itself, present in case of prefix search
360 WEIGHT - 4 bytes (HA_FT_WLEN), either weight or number of subkeys
361 REFERENCE - rec_reflength bytes, pointer to the record
362
363 returns 1 if the search was finished (must-word wasn't found)
364 */
_ft2_search_no_lock(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)365 static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
366 {
367 int r;
368 int subkeys=1;
369 my_bool can_go_down;
370 MI_INFO *info=ftb->info;
371 uint off= 0, extra= HA_FT_WLEN + info->s->rec_reflength;
372 uchar *lastkey_buf=ftbw->word+ftbw->off;
373 uint max_word_length= (ftbw->flags & FTB_FLAG_TRUNC) ? MI_MAX_KEY_BUFF :
374 ((ftbw->len) * ftb->charset->mbmaxlen) + extra;
375
376 if (ftbw->flags & FTB_FLAG_TRUNC)
377 lastkey_buf+=ftbw->len;
378
379 if (init_search)
380 {
381 ftbw->key_root=info->s->state.key_root[ftb->keynr];
382 ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
383
384 r=_mi_search(info, ftbw->keyinfo, (uchar*) ftbw->word, ftbw->len,
385 SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
386 }
387 else
388 {
389 uint sflag= SEARCH_BIGGER;
390 my_off_t max_docid=0;
391 FTB_EXPR *tmp;
392
393 for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
394 set_if_bigger(max_docid, tmp->max_docid);
395
396 if (ftbw->docid[0] < max_docid)
397 {
398 sflag|= SEARCH_SAME;
399 _mi_dpointer(info, (uchar*) (lastkey_buf + HA_FT_WLEN +
400 (ftbw->off ? 0 : lastkey_buf[0] + 1)),
401 max_docid);
402 }
403 r=_mi_search(info, ftbw->keyinfo, (uchar*) lastkey_buf,
404 USE_WHOLE_KEY, sflag, ftbw->key_root);
405 }
406
407 can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
408 /* Skip rows inserted by concurrent insert */
409 while (!r)
410 {
411 if (can_go_down)
412 {
413 /* going down ? */
414 off=info->lastkey_length-extra;
415 subkeys=ft_sintXkorr(info->lastkey+off);
416 }
417 if (subkeys<0 || info->lastpos < info->state->data_file_length)
418 break;
419 r= _mi_search_next(info, ftbw->keyinfo, info->lastkey,
420 info->lastkey_length,
421 SEARCH_BIGGER, ftbw->key_root);
422 }
423
424 if (!r && !ftbw->off)
425 {
426 r= ha_compare_text(ftb->charset,
427 info->lastkey+1,
428 info->lastkey_length-extra-1,
429 (uchar*) ftbw->word+1,
430 ftbw->len-1,
431 (my_bool) (ftbw->flags & FTB_FLAG_TRUNC),0);
432 }
433
434 if (r || info->lastkey_length > max_word_length) /* not found */
435 {
436 if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
437 {
438 ftbw->docid[0]=HA_OFFSET_ERROR;
439 if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
440 {
441 /*
442 This word MUST BE present in every document returned,
443 so we can stop the search right now
444 */
445 ftb->state=INDEX_DONE;
446 return 1; /* search is done */
447 }
448 else
449 return 0;
450 }
451
452 /*
453 Going up to the first-level tree to continue search there.
454 Only done when performing prefix search.
455
456 Key buffer data pointer as well as docid[0] may be smaller
457 than values we got while searching first-level tree. Thus
458 they must be restored to original values to avoid dead-loop,
459 when subsequent search for a bigger value eventually ends up
460 in this same second-level tree.
461 */
462 _mi_dpointer(info, (uchar*) (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
463 ftbw->docid[0]= ftbw->key_root;
464 ftbw->key_root=info->s->state.key_root[ftb->keynr];
465 ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
466 ftbw->off=0;
467 return _ft2_search_no_lock(ftb, ftbw, 0);
468 }
469
470 /* matching key found */
471 memcpy(lastkey_buf, info->lastkey, info->lastkey_length);
472 if (lastkey_buf == ftbw->word)
473 ftbw->len=info->lastkey_length-extra;
474
475 /* going down ? */
476 if (subkeys<0)
477 {
478 /*
479 yep, going down, to the second-level tree
480 TODO here: subkey-based optimization
481 */
482 ftbw->off=off;
483 ftbw->key_root=info->lastpos;
484 ftbw->keyinfo=& info->s->ft2_keyinfo;
485 r=_mi_search_first(info, ftbw->keyinfo, ftbw->key_root);
486 assert(r==0); /* found something */
487 memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length);
488 }
489 ftbw->docid[0]=info->lastpos;
490 if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
491 ftbw->max_docid_expr->max_docid= info->lastpos;
492 return 0;
493 }
494
_ft2_search(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)495 static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
496 {
497 int r;
498 MYISAM_SHARE *share= ftb->info->s;
499 if (share->concurrent_insert)
500 mysql_rwlock_rdlock(&share->key_root_lock[ftb->keynr]);
501 r= _ft2_search_no_lock(ftb, ftbw, init_search);
502 if (share->concurrent_insert)
503 mysql_rwlock_unlock(&share->key_root_lock[ftb->keynr]);
504 return r;
505 }
506
_ftb_init_index_search(FT_INFO * ftb)507 static void _ftb_init_index_search(FT_INFO *ftb)
508 {
509 int i;
510 FTB_WORD *ftbw;
511
512 if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
513 return;
514 ftb->state=INDEX_SEARCH;
515
516 for (i=ftb->queue.elements; i; i--)
517 {
518 ftbw=(FTB_WORD *)(ftb->queue.root[i]);
519
520 if (ftbw->flags & FTB_FLAG_TRUNC)
521 {
522 /*
523 special treatment for truncation operator
524 1. there are some (besides this) +words
525 | no need to search in the index, it can never ADD new rows
526 | to the result, and to remove half-matched rows we do scan anyway
527 2. -trunc*
528 | same as 1.
529 3. in 1 and 2, +/- need not be on the same expr. level,
530 but can be on any upper level, as in +word +(trunc1* trunc2*)
531 4. otherwise
532 | We have to index-search for this prefix.
533 | It may cause duplicates, as in the index (sorted by <word,docid>)
534 | <aaaa,row1>
535 | <aabb,row2>
536 | <aacc,row1>
537 | Searching for "aa*" will find row1 twice...
538 */
539 FTB_EXPR *ftbe;
540 for (ftbe=(FTB_EXPR*)ftbw;
541 ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
542 ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
543 {
544 if (ftbe->flags & FTB_FLAG_NO || /* 2 */
545 ftbe->up->ythresh - ftbe->up->yweaks >
546 (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES)) /* 1 */
547 {
548 FTB_EXPR *top_ftbe=ftbe->up;
549 ftbw->docid[0]=HA_OFFSET_ERROR;
550 for (ftbe=(FTB_EXPR *)ftbw;
551 ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
552 ftbe=ftbe->up)
553 ftbe->up->yweaks++;
554 ftbe=0;
555 break;
556 }
557 }
558 if (!ftbe)
559 continue;
560 /* 4 */
561 if (!is_tree_inited(& ftb->no_dupes))
562 init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
563 _ftb_no_dupes_cmp,0,0,0);
564 else
565 reset_tree(& ftb->no_dupes);
566 }
567
568 ftbw->off=0; /* in case of reinit */
569 if (_ft2_search(ftb, ftbw, 1))
570 return;
571 }
572 queue_fix(& ftb->queue);
573 }
574
575
ft_init_boolean_search(MI_INFO * info,uint keynr,uchar * query,uint query_len,const CHARSET_INFO * cs)576 FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
577 uint query_len, const CHARSET_INFO *cs)
578 {
579 FTB *ftb;
580 FTB_EXPR *ftbe;
581 FTB_WORD *ftbw;
582
583 if (!(ftb=(FTB *)my_malloc(mi_key_memory_FTB,
584 sizeof(FTB), MYF(MY_WME))))
585 return 0;
586 ftb->please= (struct _ft_vft *) & _ft_vft_boolean;
587 ftb->state=UNINITIALIZED;
588 ftb->info=info;
589 ftb->keynr=keynr;
590 ftb->charset=cs;
591 assert(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
592 ftb->with_scan=0;
593 ftb->lastpos=HA_OFFSET_ERROR;
594 memset(&ftb->no_dupes, 0, sizeof(TREE));
595 ftb->last_word= 0;
596
597 init_alloc_root(PSI_INSTRUMENT_ME, &ftb->mem_root, 1024, 1024);
598 ftb->queue.max_elements= 0;
599 if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
600 goto err;
601 ftbe->weight=1;
602 ftbe->flags=FTB_FLAG_YES;
603 ftbe->nos=1;
604 ftbe->up=0;
605 ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
606 ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
607 ftbe->phrase= NULL;
608 ftbe->document= 0;
609 ftb->root=ftbe;
610 if (unlikely(_ftb_parse_query(ftb, query, query_len,
611 keynr == NO_SUCH_KEY ? &ft_default_parser :
612 info->s->keyinfo[keynr].parser)))
613 goto err;
614 /*
615 Hack: instead of init_queue, we'll use reinit queue to be able
616 to alloc queue with alloc_root()
617 */
618 if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
619 (ftb->queue.max_elements + 1) *
620 sizeof(void *))))
621 goto err;
622 reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
623 (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
624 for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
625 queue_insert(&ftb->queue, (uchar *)ftbw);
626 ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
627 sizeof(FTB_WORD *)*ftb->queue.elements);
628 memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
629 my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
630 (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
631 if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
632 ftb->state=READY;
633 return ftb;
634 err:
635 free_root(& ftb->mem_root, MYF(0));
636 my_free(ftb);
637 return 0;
638 }
639
640
641 typedef struct st_my_ftb_phrase_param
642 {
643 LIST *phrase;
644 LIST *document;
645 const CHARSET_INFO *cs;
646 uint phrase_length;
647 uint document_length;
648 uint match;
649 } MY_FTB_PHRASE_PARAM;
650
651
ftb_phrase_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))652 static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
653 char *word, int word_len,
654 MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
655 {
656 MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
657 FT_WORD *w= (FT_WORD *)phrase_param->document->data;
658 LIST *phrase, *document;
659 w->pos= (uchar*) word;
660 w->len= word_len;
661 phrase_param->document= phrase_param->document->prev;
662 if (phrase_param->phrase_length > phrase_param->document_length)
663 {
664 phrase_param->document_length++;
665 return 0;
666 }
667 /* TODO: rewrite phrase search to avoid
668 comparing the same word twice. */
669 for (phrase= phrase_param->phrase, document= phrase_param->document->next;
670 phrase; phrase= phrase->next, document= document->next)
671 {
672 FT_WORD *phrase_word= (FT_WORD *)phrase->data;
673 FT_WORD *document_word= (FT_WORD *)document->data;
674 if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
675 phrase_word->len,
676 (uchar*) document_word->pos, document_word->len))
677 return 0;
678 }
679 phrase_param->match++;
680 return 0;
681 }
682
683
ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM * param,char * document,int len)684 static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
685 char *document, int len)
686 {
687 FT_WORD word;
688 MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
689 const uchar *docend= (uchar*) document + len;
690 while (ft_simple_get_word(phrase_param->cs, (uchar**) &document, docend,
691 &word, FALSE))
692 {
693 param->mysql_add_word(param, (char*) word.pos, word.len, 0);
694 if (phrase_param->match)
695 break;
696 }
697 return 0;
698 }
699
700
701 /*
702 Checks if given buffer matches phrase list.
703
704 SYNOPSIS
705 _ftb_check_phrase()
706 s0 start of buffer
707 e0 end of buffer
708 phrase broken into list phrase
709 cs charset info
710
711 RETURN VALUE
712 1 is returned if phrase found, 0 else.
713 -1 is returned if error occurs.
714 */
715
_ftb_check_phrase(FTB * ftb,const uchar * document,uint len,FTB_EXPR * ftbe,struct st_mysql_ftparser * parser)716 static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
717 FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
718 {
719 MY_FTB_PHRASE_PARAM ftb_param;
720 MYSQL_FTPARSER_PARAM *param;
721 DBUG_ENTER("_ftb_check_phrase");
722 assert(parser);
723
724 if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
725 DBUG_RETURN(0);
726
727 ftb_param.phrase= ftbe->phrase;
728 ftb_param.document= ftbe->document;
729 ftb_param.cs= ftb->charset;
730 ftb_param.phrase_length= list_length(ftbe->phrase);
731 ftb_param.document_length= 1;
732 ftb_param.match= 0;
733
734 param->mysql_parse= ftb_check_phrase_internal;
735 param->mysql_add_word= ftb_phrase_add_word;
736 param->mysql_ftparam= (void *)&ftb_param;
737 param->cs= ftb->charset;
738 param->doc= (char *) document;
739 param->length= len;
740 param->flags= 0;
741 param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
742 if (unlikely(parser->parse(param)))
743 DBUG_RETURN(-1);
744 DBUG_RETURN(ftb_param.match ? 1 : 0);
745 }
746
747
_ftb_climb_the_tree(FTB * ftb,FTB_WORD * ftbw,FT_SEG_ITERATOR * ftsi_orig)748 static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
749 {
750 FT_SEG_ITERATOR ftsi;
751 FTB_EXPR *ftbe;
752 float weight=ftbw->weight;
753 int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
754 my_off_t curdoc=ftbw->docid[mode];
755 struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
756 &ft_default_parser :
757 ftb->info->s->keyinfo[ftb->keynr].parser;
758
759 for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
760 {
761 ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
762 if (ftbe->docid[mode] != curdoc)
763 {
764 ftbe->cur_weight=0;
765 ftbe->yesses=ftbe->nos=0;
766 ftbe->docid[mode]=curdoc;
767 }
768 if (ftbe->nos)
769 break;
770 if (yn_flag & FTB_FLAG_YES)
771 {
772 weight /= ftbe->ythresh;
773 ftbe->cur_weight += weight;
774 if ((int) ++ftbe->yesses == ythresh)
775 {
776 yn_flag=ftbe->flags;
777 weight=ftbe->cur_weight*ftbe->weight;
778 if (mode && ftbe->phrase)
779 {
780 int found= 0;
781
782 memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
783 while (_mi_ft_segiterator(&ftsi) && !found)
784 {
785 if (!ftsi.pos)
786 continue;
787 found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
788 if (unlikely(found < 0))
789 return 1;
790 }
791 if (!found)
792 break;
793 } /* ftbe->quot */
794 }
795 else
796 break;
797 }
798 else
799 if (yn_flag & FTB_FLAG_NO)
800 {
801 /*
802 NOTE: special sort function of queue assures that all
803 (yn_flag & FTB_FLAG_NO) != 0
804 events for every particular subexpression will
805 "auto-magically" happen BEFORE all the
806 (yn_flag & FTB_FLAG_YES) != 0 events. So no
807 already matched expression can become not-matched again.
808 */
809 ++ftbe->nos;
810 break;
811 }
812 else
813 {
814 if (ftbe->ythresh)
815 weight/=3;
816 ftbe->cur_weight += weight;
817 if ((int) ftbe->yesses < ythresh)
818 break;
819 if (!(yn_flag & FTB_FLAG_WONLY))
820 yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
821 weight*= ftbe->weight;
822 }
823 }
824 return 0;
825 }
826
827
ft_boolean_read_next(FT_INFO * ftb,char * record)828 int ft_boolean_read_next(FT_INFO *ftb, char *record)
829 {
830 FTB_EXPR *ftbe;
831 FTB_WORD *ftbw;
832 MI_INFO *info=ftb->info;
833 my_off_t curdoc;
834
835 if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
836 return -1;
837
838 /* black magic ON */
839 if ((int) _mi_check_index(info, ftb->keynr) < 0)
840 return my_errno();
841 if (_mi_readinfo(info, F_RDLCK, 1))
842 return my_errno();
843 /* black magic OFF */
844
845 if (!ftb->queue.elements)
846 {
847 set_my_errno(HA_ERR_END_OF_FILE);
848 return HA_ERR_END_OF_FILE;
849 }
850
851 /* Attention!!! Address of a local variable is used here! See err: label */
852 ftb->queue.first_cmp_arg=(void *)&curdoc;
853
854 while (ftb->state == INDEX_SEARCH &&
855 (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
856 HA_OFFSET_ERROR)
857 {
858 while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
859 {
860 if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
861 {
862 set_my_errno(HA_ERR_OUT_OF_MEM);
863 goto err;
864 }
865
866 /* update queue */
867 _ft2_search(ftb, ftbw, 0);
868 queue_replaced(& ftb->queue);
869 }
870
871 ftbe=ftb->root;
872 if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
873 ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
874 {
875 /* curdoc matched ! */
876 if (is_tree_inited(&ftb->no_dupes) &&
877 tree_insert(&ftb->no_dupes, &curdoc, 0,
878 ftb->no_dupes.custom_arg)->count >1)
879 /* but it managed already to get past this line once */
880 continue;
881
882 info->lastpos=curdoc;
883 /* Clear all states, except that the table was updated */
884 info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
885
886 if (!(*info->read_record)(info,curdoc, (uchar*) record))
887 {
888 info->update|= HA_STATE_AKTIV; /* Record is read */
889 if (ftb->with_scan &&
890 ft_boolean_find_relevance(ftb,(uchar*) record,0)==0)
891 continue; /* no match */
892 set_my_errno(0);
893 goto err;
894 }
895 goto err;
896 }
897 }
898 ftb->state=INDEX_DONE;
899 set_my_errno(HA_ERR_END_OF_FILE);
900 err:
901 ftb->queue.first_cmp_arg=(void *)0;
902 return my_errno();
903 }
904
905
906 typedef struct st_my_ftb_find_param
907 {
908 FT_INFO *ftb;
909 FT_SEG_ITERATOR *ftsi;
910 } MY_FTB_FIND_PARAM;
911
912
ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))913 static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
914 char *word, int len,
915 MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
916 {
917 MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
918 FT_INFO *ftb= ftb_param->ftb;
919 FTB_WORD *ftbw;
920 int a, b, c;
921 /*
922 Find right-most element in the array of query words matching this
923 word from a document.
924 */
925 for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
926 {
927 ftbw= ftb->list[c];
928 if (ha_compare_text(ftb->charset, (uchar*)word, len,
929 (uchar*)ftbw->word+1, ftbw->len-1,
930 (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0) < 0)
931 b= c;
932 else
933 a= c;
934 }
935 /*
936 If there were no words with truncation operator, we iterate to the
937 beginning of an array until array element is equal to the word from
938 a document. This is done mainly because the same word may be
939 mentioned twice (or more) in the query.
940
941 In case query has words with truncation operator we must iterate
942 to the beginning of the array. There may be non-matching query words
943 between matching word with truncation operator and the right-most
944 matching element. E.g., if we're looking for 'aaa15' in an array of
945 'aaa1* aaa14 aaa15 aaa16'.
946
947 Worse of that there still may be match even if the binary search
948 above didn't find matching element. E.g., if we're looking for
949 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
950 stop at 'aaa16'.
951 */
952 for (; c >= 0; c--)
953 {
954 ftbw= ftb->list[c];
955 if (ha_compare_text(ftb->charset, (uchar*)word, len,
956 (uchar*)ftbw->word + 1,ftbw->len - 1,
957 (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
958 {
959 if (ftb->with_scan & FTB_FLAG_TRUNC)
960 continue;
961 else
962 break;
963 }
964 if (ftbw->docid[1] == ftb->info->lastpos)
965 continue;
966 ftbw->docid[1]= ftb->info->lastpos;
967 if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
968 return 1;
969 }
970 return(0);
971 }
972
973
ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM * param,char * doc,int len)974 static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
975 char *doc, int len)
976 {
977 MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
978 FT_INFO *ftb= ftb_param->ftb;
979 uchar *end= (uchar*) doc + len;
980 FT_WORD w;
981 while (ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE))
982 param->mysql_add_word(param, (char*) w.pos, w.len, 0);
983 return(0);
984 }
985
986
ft_boolean_find_relevance(FT_INFO * ftb,uchar * record,uint length)987 float ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
988 {
989 FTB_EXPR *ftbe;
990 FT_SEG_ITERATOR ftsi, ftsi2;
991 my_off_t docid=ftb->info->lastpos;
992 MY_FTB_FIND_PARAM ftb_param;
993 MYSQL_FTPARSER_PARAM *param;
994 struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
995 &ft_default_parser :
996 ftb->info->s->keyinfo[ftb->keynr].parser;
997
998 if (docid == HA_OFFSET_ERROR)
999 return -2.0;
1000 if (!ftb->queue.elements)
1001 return 0;
1002 if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
1003 return 0;
1004
1005 if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
1006 {
1007 FTB_EXPR *x;
1008 uint i;
1009
1010 for (i=0; i < ftb->queue.elements; i++)
1011 {
1012 ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
1013 for (x=ftb->list[i]->up; x; x=x->up)
1014 x->docid[1]=HA_OFFSET_ERROR;
1015 }
1016 }
1017
1018 ftb->lastpos=docid;
1019
1020 if (ftb->keynr==NO_SUCH_KEY)
1021 _mi_ft_segiterator_dummy_init(record, length, &ftsi);
1022 else
1023 _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
1024 memcpy(&ftsi2, &ftsi, sizeof(ftsi));
1025
1026 ftb_param.ftb= ftb;
1027 ftb_param.ftsi= &ftsi2;
1028 param->mysql_parse= ftb_find_relevance_parse;
1029 param->mysql_add_word= ftb_find_relevance_add_word;
1030 param->mysql_ftparam= (void *)&ftb_param;
1031 param->flags= 0;
1032 param->cs= ftb->charset;
1033 param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
1034 while (_mi_ft_segiterator(&ftsi))
1035 {
1036 if (!ftsi.pos)
1037 continue;
1038 param->doc= (char *)ftsi.pos;
1039 param->length= ftsi.len;
1040 if (unlikely(parser->parse(param)))
1041 return 0;
1042 }
1043 ftbe=ftb->root;
1044 if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
1045 ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
1046 { /* row matched ! */
1047 return ftbe->cur_weight;
1048 }
1049 else
1050 { /* match failed ! */
1051 return 0.0;
1052 }
1053 }
1054
1055
ft_boolean_close_search(FT_INFO * ftb)1056 void ft_boolean_close_search(FT_INFO *ftb)
1057 {
1058 if (is_tree_inited(& ftb->no_dupes))
1059 {
1060 delete_tree(& ftb->no_dupes);
1061 }
1062 free_root(& ftb->mem_root, MYF(0));
1063 my_free(ftb);
1064 }
1065
1066
ft_boolean_get_relevance(FT_INFO * ftb)1067 float ft_boolean_get_relevance(FT_INFO *ftb)
1068 {
1069 return ftb->root->cur_weight;
1070 }
1071
1072
ft_boolean_reinit_search(FT_INFO * ftb)1073 void ft_boolean_reinit_search(FT_INFO *ftb)
1074 {
1075 _ftb_init_index_search(ftb);
1076 }
1077
1078