1 /* Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /* Written by Sergei A. Golubchik, who has a shared copyright to this code */
24
25 /* TODO: add caching - pre-read several index entries at once */
26
27 /*
28 Added optimization for full-text queries with plus-words. It was
29 implemented by sharing maximal document id (max_docid) variable
30 inside plus subtree. max_docid could be used by any word in plus
31 subtree, but it could be updated by plus-word only.
32
33 Fulltext "smarter index merge" optimization assumes that rows
34 it gets are ordered by doc_id. That is not the case when we
35 search for a word with truncation operator. It may return
36 rows in random order. Thus we may not use "smarter index merge"
37 optimization with "trunc-words".
38
39 The idea is: there is no need to search for docid smaller than
40 biggest docid inside current plus subtree or any upper plus subtree.
41
42 Examples:
43 +word1 word2
44 share same max_docid
45 max_docid updated by word1
46 +word1 +(word2 word3)
47 share same max_docid
48 max_docid updated by word1
49 +(word1 -word2) +(+word3 word4)
50 share same max_docid
51 max_docid updated by word3
52 +word1 word2 (+word3 word4 (+word5 word6))
53 three subexpressions (including the top-level one),
54 every one has its own max_docid, updated by its plus word.
55 but for the search word6 uses
56 max(word1.max_docid, word3.max_docid, word5.max_docid),
57 while word4 uses, accordingly,
58 max(word1.max_docid, word3.max_docid).
59 */
60
61 #define FT_CORE
62 #include "ftdefs.h"
63
64 /* search with boolean queries */
65
66 static double _wghts[11]=
67 {
68 0.131687242798354,
69 0.197530864197531,
70 0.296296296296296,
71 0.444444444444444,
72 0.666666666666667,
73 1.000000000000000,
74 1.500000000000000,
75 2.250000000000000,
76 3.375000000000000,
77 5.062500000000000,
78 7.593750000000000};
79 static double *wghts=_wghts+5; /* wghts[i] = 1.5**i */
80
81 static double _nwghts[11]=
82 {
83 -0.065843621399177,
84 -0.098765432098766,
85 -0.148148148148148,
86 -0.222222222222222,
87 -0.333333333333334,
88 -0.500000000000000,
89 -0.750000000000000,
90 -1.125000000000000,
91 -1.687500000000000,
92 -2.531250000000000,
93 -3.796875000000000};
94 static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i */
95
96 #define FTB_FLAG_TRUNC 1
97 /* At most one of the following flags can be set */
98 #define FTB_FLAG_YES 2
99 #define FTB_FLAG_NO 4
100 #define FTB_FLAG_WONLY 8
101
102 #define CMP_NUM(a,b) (((a) < (b)) ? -1 : ((a) == (b)) ? 0 : 1)
103
104 typedef struct st_ftb_expr FTB_EXPR;
105 struct st_ftb_expr
106 {
107 FTB_EXPR *up;
108 uint flags;
109 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
110 my_off_t docid[2];
111 my_off_t max_docid;
112 float weight;
113 float cur_weight;
114 LIST *phrase; /* phrase words */
115 LIST *document; /* for phrase search */
116 uint yesses; /* number of "yes" words matched */
117 uint nos; /* number of "no" words matched */
118 uint ythresh; /* number of "yes" words in expr */
119 uint yweaks; /* number of "yes" words for scan only */
120 };
121
122 typedef struct st_ftb_word
123 {
124 FTB_EXPR *up;
125 uint flags;
126 /* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
127 my_off_t docid[2]; /* for index search and for scan */
128 my_off_t key_root;
129 FTB_EXPR *max_docid_expr;
130 MI_KEYDEF *keyinfo;
131 struct st_ftb_word *prev;
132 float weight;
133 uint ndepth;
134 uint len;
135 uchar off;
136 uchar word[1];
137 } FTB_WORD;
138
139 typedef struct st_ft_info
140 {
141 struct _ft_vft *please;
142 MI_INFO *info;
143 const CHARSET_INFO *charset;
144 FTB_EXPR *root;
145 FTB_WORD **list;
146 FTB_WORD *last_word;
147 MEM_ROOT mem_root;
148 QUEUE queue;
149 TREE no_dupes;
150 my_off_t lastpos;
151 uint keynr;
152 uchar with_scan;
153 enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
154 } FTB;
155
FTB_WORD_cmp(my_off_t * v,FTB_WORD * a,FTB_WORD * b)156 static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
157 {
158 int i;
159
160 /* if a==curdoc, take it as a < b */
161 if (v && a->docid[0] == *v)
162 return -1;
163
164 /* ORDER BY docid, ndepth DESC */
165 i=CMP_NUM(a->docid[0], b->docid[0]);
166 if (!i)
167 i=CMP_NUM(b->ndepth,a->ndepth);
168 return i;
169 }
170
FTB_WORD_cmp_list(CHARSET_INFO * cs,FTB_WORD ** a,FTB_WORD ** b)171 static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
172 {
173 /* ORDER BY word, ndepth */
174 int i= ha_compare_text(cs, (uchar*) (*a)->word + 1, (*a)->len - 1,
175 (uchar*) (*b)->word + 1, (*b)->len - 1, 0, 0);
176 if (!i)
177 i= CMP_NUM((*a)->ndepth, (*b)->ndepth);
178 return i;
179 }
180
181
182 typedef struct st_my_ftb_param
183 {
184 FTB *ftb;
185 FTB_EXPR *ftbe;
186 uchar *up_quot;
187 uint depth;
188 } MY_FTB_PARAM;
189
190
ftb_query_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * info)191 static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
192 char *word, int word_len,
193 MYSQL_FTPARSER_BOOLEAN_INFO *info)
194 {
195 MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
196 FTB_WORD *ftbw;
197 FTB_EXPR *ftbe, *tmp_expr;
198 FT_WORD *phrase_word;
199 LIST *tmp_element;
200 int r= info->weight_adjust;
201 float weight= (float)
202 (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
203
204 switch (info->type) {
205 case FT_TOKEN_WORD:
206 ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
207 sizeof(FTB_WORD) +
208 (info->trunc ? MI_MAX_KEY_BUFF :
209 (word_len + 1) *
210 ftb_param->ftb->charset->mbmaxlen +
211 HA_FT_WLEN +
212 ftb_param->ftb->info->s->rec_reflength));
213 ftbw->len= word_len + 1;
214 ftbw->flags= 0;
215 ftbw->off= 0;
216 if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
217 if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
218 if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
219 ftbw->weight= weight;
220 ftbw->up= ftb_param->ftbe;
221 ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
222 ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
223 ftbw->key_root= HA_OFFSET_ERROR;
224 memcpy(ftbw->word + 1, word, word_len);
225 ftbw->word[0]= word_len;
226 if (info->yesno > 0) ftbw->up->ythresh++;
227 ftb_param->ftb->queue.max_elements++;
228 ftbw->prev= ftb_param->ftb->last_word;
229 ftb_param->ftb->last_word= ftbw;
230 ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
231 for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
232 if (! (tmp_expr->flags & FTB_FLAG_YES))
233 break;
234 ftbw->max_docid_expr= tmp_expr;
235 /* fall through */
236 case FT_TOKEN_STOPWORD:
237 if (! ftb_param->up_quot) break;
238 phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
239 tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
240 phrase_word->pos= (uchar*) word;
241 phrase_word->len= word_len;
242 tmp_element->data= (void *)phrase_word;
243 ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
244 /* Allocate document list at this point.
245 It allows to avoid huge amount of allocs/frees for each row.*/
246 tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
247 tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
248 ftb_param->ftbe->document=
249 list_add(ftb_param->ftbe->document, tmp_element);
250 break;
251 case FT_TOKEN_LEFT_PAREN:
252 ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
253 ftbe->flags= 0;
254 if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
255 if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
256 ftbe->weight= weight;
257 ftbe->up= ftb_param->ftbe;
258 ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
259 ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
260 ftbe->phrase= NULL;
261 ftbe->document= 0;
262 if (info->quot) ftb_param->ftb->with_scan|= 2;
263 if (info->yesno > 0) ftbe->up->ythresh++;
264 ftb_param->ftbe= ftbe;
265 ftb_param->depth++;
266 ftb_param->up_quot= (uchar*) info->quot;
267 break;
268 case FT_TOKEN_RIGHT_PAREN:
269 if (ftb_param->ftbe->document)
270 {
271 /* Circuit document list */
272 for (tmp_element= ftb_param->ftbe->document;
273 tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
274 tmp_element->next= ftb_param->ftbe->document;
275 ftb_param->ftbe->document->prev= tmp_element;
276 }
277 info->quot= 0;
278 if (ftb_param->ftbe->up)
279 {
280 DBUG_ASSERT(ftb_param->depth);
281 ftb_param->ftbe= ftb_param->ftbe->up;
282 ftb_param->depth--;
283 ftb_param->up_quot= 0;
284 }
285 break;
286 case FT_TOKEN_EOF:
287 default:
288 break;
289 }
290 return(0);
291 }
292
293
ftb_parse_query_internal(MYSQL_FTPARSER_PARAM * param,char * query,int len)294 static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
295 char *query, int len)
296 {
297 MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
298 MYSQL_FTPARSER_BOOLEAN_INFO info;
299 const CHARSET_INFO *cs= ftb_param->ftb->charset;
300 uchar **start= (uchar**) &query;
301 uchar *end= (uchar*) query + len;
302 FT_WORD w;
303
304 info.prev= ' ';
305 info.quot= 0;
306 while (ft_get_word(cs, start, end, &w, &info))
307 param->mysql_add_word(param, (char*) w.pos, w.len, &info);
308 return(0);
309 }
310
311
_ftb_parse_query(FTB * ftb,uchar * query,uint len,struct st_mysql_ftparser * parser)312 static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
313 struct st_mysql_ftparser *parser)
314 {
315 MYSQL_FTPARSER_PARAM *param;
316 MY_FTB_PARAM ftb_param;
317 DBUG_ENTER("_ftb_parse_query");
318 DBUG_ASSERT(parser);
319
320 if (ftb->state != UNINITIALIZED)
321 DBUG_RETURN(0);
322 if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
323 DBUG_RETURN(1);
324
325 ftb_param.ftb= ftb;
326 ftb_param.depth= 0;
327 ftb_param.ftbe= ftb->root;
328 ftb_param.up_quot= 0;
329
330 param->mysql_parse= ftb_parse_query_internal;
331 param->mysql_add_word= ftb_query_add_word;
332 param->mysql_ftparam= (void *)&ftb_param;
333 param->cs= ftb->charset;
334 param->doc= (char*) query;
335 param->length= len;
336 param->flags= 0;
337 param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
338 DBUG_RETURN(parser->parse(param));
339 }
340
341
_ftb_no_dupes_cmp(const void * not_used MY_ATTRIBUTE ((unused)),const void * a,const void * b)342 static int _ftb_no_dupes_cmp(const void* not_used MY_ATTRIBUTE((unused)),
343 const void *a,const void *b)
344 {
345 return CMP_NUM((*((my_off_t*)a)), (*((my_off_t*)b)));
346 }
347
348 /*
349 When performing prefix search (a word with truncation operator), we
350 must preserve original prefix to ensure that characters which may be
351 expanded/contracted do not break the prefix. This is done by storing
352 newly found key immediatly after the original word in ftbw->word
353 buffer.
354
355 ftbw->word= LENGTH WORD [ LENGTH1 WORD1 ] WEIGHT REFERENCE
356 LENGTH - 1 byte, length of the WORD
357 WORD - LENGTH bytes, the word itself
358 LENGTH1 - 1 byte, length of the WORD1, present in case of prefix search
359 WORD1 - LENGTH bytes, the word itself, present in case of prefix search
360 WEIGHT - 4 bytes (HA_FT_WLEN), either weight or number of subkeys
361 REFERENCE - rec_reflength bytes, pointer to the record
362
363 returns 1 if the search was finished (must-word wasn't found)
364 */
_ft2_search_no_lock(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)365 static int _ft2_search_no_lock(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
366 {
367 int r;
368 int subkeys=1;
369 my_bool can_go_down;
370 MI_INFO *info=ftb->info;
371 uint UNINIT_VAR(off), extra= HA_FT_WLEN + info->s->rec_reflength;
372 uchar *lastkey_buf=ftbw->word+ftbw->off;
373 uint max_word_length= (ftbw->flags & FTB_FLAG_TRUNC) ? MI_MAX_KEY_BUFF :
374 ((ftbw->len) * ftb->charset->mbmaxlen) + extra;
375
376 if (ftbw->flags & FTB_FLAG_TRUNC)
377 lastkey_buf+=ftbw->len;
378
379 if (init_search)
380 {
381 ftbw->key_root=info->s->state.key_root[ftb->keynr];
382 ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
383
384 r=_mi_search(info, ftbw->keyinfo, (uchar*) ftbw->word, ftbw->len,
385 SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
386 }
387 else
388 {
389 uint sflag= SEARCH_BIGGER;
390 my_off_t max_docid=0;
391 FTB_EXPR *tmp;
392
393 for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
394 set_if_bigger(max_docid, tmp->max_docid);
395
396 if (ftbw->docid[0] < max_docid)
397 {
398 sflag|= SEARCH_SAME;
399 _mi_dpointer(info, (uchar*) (lastkey_buf + HA_FT_WLEN +
400 (ftbw->off ? 0 : lastkey_buf[0] + 1)),
401 max_docid);
402 }
403 r=_mi_search(info, ftbw->keyinfo, (uchar*) lastkey_buf,
404 USE_WHOLE_KEY, sflag, ftbw->key_root);
405 }
406
407 can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
408 /* Skip rows inserted by concurrent insert */
409 while (!r)
410 {
411 if (can_go_down)
412 {
413 /* going down ? */
414 off=info->lastkey_length-extra;
415 subkeys=ft_sintXkorr(info->lastkey+off);
416 }
417 if (subkeys<0 || info->lastpos < info->state->data_file_length)
418 break;
419 r= _mi_search_next(info, ftbw->keyinfo, info->lastkey,
420 info->lastkey_length,
421 SEARCH_BIGGER, ftbw->key_root);
422 }
423
424 if (!r && !ftbw->off)
425 {
426 r= ha_compare_text(ftb->charset,
427 info->lastkey+1,
428 info->lastkey_length-extra-1,
429 (uchar*) ftbw->word+1,
430 ftbw->len-1,
431 (my_bool) (ftbw->flags & FTB_FLAG_TRUNC),0);
432 }
433
434 if (r || info->lastkey_length > max_word_length) /* not found */
435 {
436 if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
437 {
438 ftbw->docid[0]=HA_OFFSET_ERROR;
439 if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
440 {
441 /*
442 This word MUST BE present in every document returned,
443 so we can stop the search right now
444 */
445 ftb->state=INDEX_DONE;
446 return 1; /* search is done */
447 }
448 else
449 return 0;
450 }
451
452 /*
453 Going up to the first-level tree to continue search there.
454 Only done when performing prefix search.
455
456 Key buffer data pointer as well as docid[0] may be smaller
457 than values we got while searching first-level tree. Thus
458 they must be restored to original values to avoid dead-loop,
459 when subsequent search for a bigger value eventually ends up
460 in this same second-level tree.
461 */
462 _mi_dpointer(info, (uchar*) (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
463 ftbw->docid[0]= ftbw->key_root;
464 ftbw->key_root=info->s->state.key_root[ftb->keynr];
465 ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
466 ftbw->off=0;
467 return _ft2_search_no_lock(ftb, ftbw, 0);
468 }
469
470 /* matching key found */
471 memcpy(lastkey_buf, info->lastkey, info->lastkey_length);
472 if (lastkey_buf == ftbw->word)
473 ftbw->len=info->lastkey_length-extra;
474
475 /* going down ? */
476 if (subkeys<0)
477 {
478 /*
479 yep, going down, to the second-level tree
480 TODO here: subkey-based optimization
481 */
482 ftbw->off=off;
483 ftbw->key_root=info->lastpos;
484 ftbw->keyinfo=& info->s->ft2_keyinfo;
485 r=_mi_search_first(info, ftbw->keyinfo, ftbw->key_root);
486 DBUG_ASSERT(r==0); /* found something */
487 memcpy(lastkey_buf+off, info->lastkey, info->lastkey_length);
488 }
489 ftbw->docid[0]=info->lastpos;
490 if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
491 ftbw->max_docid_expr->max_docid= info->lastpos;
492 return 0;
493 }
494
_ft2_search(FTB * ftb,FTB_WORD * ftbw,my_bool init_search)495 static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
496 {
497 int r;
498 MYISAM_SHARE *share= ftb->info->s;
499 if (share->concurrent_insert)
500 mysql_rwlock_rdlock(&share->key_root_lock[ftb->keynr]);
501 r= _ft2_search_no_lock(ftb, ftbw, init_search);
502 if (share->concurrent_insert)
503 mysql_rwlock_unlock(&share->key_root_lock[ftb->keynr]);
504 return r;
505 }
506
_ftb_init_index_search(FT_INFO * ftb)507 static void _ftb_init_index_search(FT_INFO *ftb)
508 {
509 int i;
510 FTB_WORD *ftbw;
511
512 if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
513 return;
514 ftb->state=INDEX_SEARCH;
515
516 for (i=ftb->queue.elements; i; i--)
517 {
518 ftbw=(FTB_WORD *)(ftb->queue.root[i]);
519
520 if (ftbw->flags & FTB_FLAG_TRUNC)
521 {
522 /*
523 special treatment for truncation operator
524 1. there are some (besides this) +words
525 | no need to search in the index, it can never ADD new rows
526 | to the result, and to remove half-matched rows we do scan anyway
527 2. -trunc*
528 | same as 1.
529 3. in 1 and 2, +/- need not be on the same expr. level,
530 but can be on any upper level, as in +word +(trunc1* trunc2*)
531 4. otherwise
532 | We have to index-search for this prefix.
533 | It may cause duplicates, as in the index (sorted by <word,docid>)
534 | <aaaa,row1>
535 | <aabb,row2>
536 | <aacc,row1>
537 | Searching for "aa*" will find row1 twice...
538 */
539 FTB_EXPR *ftbe;
540 for (ftbe=(FTB_EXPR*)ftbw;
541 ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
542 ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
543 {
544 if (ftbe->flags & FTB_FLAG_NO || /* 2 */
545 ftbe->up->ythresh - ftbe->up->yweaks >
546 (uint) MY_TEST(ftbe->flags & FTB_FLAG_YES)) /* 1 */
547 {
548 FTB_EXPR *top_ftbe=ftbe->up;
549 ftbw->docid[0]=HA_OFFSET_ERROR;
550 for (ftbe=(FTB_EXPR *)ftbw;
551 ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
552 ftbe=ftbe->up)
553 ftbe->up->yweaks++;
554 ftbe=0;
555 break;
556 }
557 }
558 if (!ftbe)
559 continue;
560 /* 4 */
561 if (!is_tree_inited(& ftb->no_dupes))
562 init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
563 _ftb_no_dupes_cmp,0,0,0);
564 else
565 reset_tree(& ftb->no_dupes);
566 }
567
568 ftbw->off=0; /* in case of reinit */
569 if (_ft2_search(ftb, ftbw, 1))
570 return;
571 }
572 queue_fix(& ftb->queue);
573 }
574
575
ft_init_boolean_search(MI_INFO * info,uint keynr,uchar * query,uint query_len,const CHARSET_INFO * cs)576 FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
577 uint query_len, const CHARSET_INFO *cs)
578 {
579 FTB *ftb;
580 FTB_EXPR *ftbe;
581 FTB_WORD *ftbw;
582
583 if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
584 return 0;
585 ftb->please= (struct _ft_vft *) & _ft_vft_boolean;
586 ftb->state=UNINITIALIZED;
587 ftb->info=info;
588 ftb->keynr=keynr;
589 ftb->charset=cs;
590 DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
591 ftb->with_scan=0;
592 ftb->lastpos=HA_OFFSET_ERROR;
593 memset(&ftb->no_dupes, 0, sizeof(TREE));
594 ftb->last_word= 0;
595
596 init_alloc_root(&ftb->mem_root, 1024, 1024);
597 ftb->queue.max_elements= 0;
598 if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
599 goto err;
600 ftbe->weight=1;
601 ftbe->flags=FTB_FLAG_YES;
602 ftbe->nos=1;
603 ftbe->up=0;
604 ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
605 ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
606 ftbe->phrase= NULL;
607 ftbe->document= 0;
608 ftb->root=ftbe;
609 if (unlikely(_ftb_parse_query(ftb, query, query_len,
610 keynr == NO_SUCH_KEY ? &ft_default_parser :
611 info->s->keyinfo[keynr].parser)))
612 goto err;
613 /*
614 Hack: instead of init_queue, we'll use reinit queue to be able
615 to alloc queue with alloc_root()
616 */
617 if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
618 (ftb->queue.max_elements + 1) *
619 sizeof(void *))))
620 goto err;
621 reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
622 (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
623 for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
624 queue_insert(&ftb->queue, (uchar *)ftbw);
625 ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
626 sizeof(FTB_WORD *)*ftb->queue.elements);
627 memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
628 my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
629 (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
630 if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
631 ftb->state=READY;
632 return ftb;
633 err:
634 free_root(& ftb->mem_root, MYF(0));
635 my_free(ftb);
636 return 0;
637 }
638
639
640 typedef struct st_my_ftb_phrase_param
641 {
642 LIST *phrase;
643 LIST *document;
644 const CHARSET_INFO *cs;
645 uint phrase_length;
646 uint document_length;
647 uint match;
648 } MY_FTB_PHRASE_PARAM;
649
650
ftb_phrase_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int word_len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))651 static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
652 char *word, int word_len,
653 MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
654 {
655 MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
656 FT_WORD *w= (FT_WORD *)phrase_param->document->data;
657 LIST *phrase, *document;
658 w->pos= (uchar*) word;
659 w->len= word_len;
660 phrase_param->document= phrase_param->document->prev;
661 if (phrase_param->phrase_length > phrase_param->document_length)
662 {
663 phrase_param->document_length++;
664 return 0;
665 }
666 /* TODO: rewrite phrase search to avoid
667 comparing the same word twice. */
668 for (phrase= phrase_param->phrase, document= phrase_param->document->next;
669 phrase; phrase= phrase->next, document= document->next)
670 {
671 FT_WORD *phrase_word= (FT_WORD *)phrase->data;
672 FT_WORD *document_word= (FT_WORD *)document->data;
673 if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
674 phrase_word->len,
675 (uchar*) document_word->pos, document_word->len))
676 return 0;
677 }
678 phrase_param->match++;
679 return 0;
680 }
681
682
ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM * param,char * document,int len)683 static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
684 char *document, int len)
685 {
686 FT_WORD word;
687 MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
688 const uchar *docend= (uchar*) document + len;
689 while (ft_simple_get_word(phrase_param->cs, (uchar**) &document, docend,
690 &word, FALSE))
691 {
692 param->mysql_add_word(param, (char*) word.pos, word.len, 0);
693 if (phrase_param->match)
694 break;
695 }
696 return 0;
697 }
698
699
700 /*
701 Checks if given buffer matches phrase list.
702
703 SYNOPSIS
704 _ftb_check_phrase()
705 s0 start of buffer
706 e0 end of buffer
707 phrase broken into list phrase
708 cs charset info
709
710 RETURN VALUE
711 1 is returned if phrase found, 0 else.
712 -1 is returned if error occurs.
713 */
714
_ftb_check_phrase(FTB * ftb,const uchar * document,uint len,FTB_EXPR * ftbe,struct st_mysql_ftparser * parser)715 static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
716 FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
717 {
718 MY_FTB_PHRASE_PARAM ftb_param;
719 MYSQL_FTPARSER_PARAM *param;
720 DBUG_ENTER("_ftb_check_phrase");
721 DBUG_ASSERT(parser);
722
723 if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
724 DBUG_RETURN(0);
725
726 ftb_param.phrase= ftbe->phrase;
727 ftb_param.document= ftbe->document;
728 ftb_param.cs= ftb->charset;
729 ftb_param.phrase_length= list_length(ftbe->phrase);
730 ftb_param.document_length= 1;
731 ftb_param.match= 0;
732
733 param->mysql_parse= ftb_check_phrase_internal;
734 param->mysql_add_word= ftb_phrase_add_word;
735 param->mysql_ftparam= (void *)&ftb_param;
736 param->cs= ftb->charset;
737 param->doc= (char *) document;
738 param->length= len;
739 param->flags= 0;
740 param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
741 if (unlikely(parser->parse(param)))
742 return -1;
743 DBUG_RETURN(ftb_param.match ? 1 : 0);
744 }
745
746
_ftb_climb_the_tree(FTB * ftb,FTB_WORD * ftbw,FT_SEG_ITERATOR * ftsi_orig)747 static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
748 {
749 FT_SEG_ITERATOR ftsi;
750 FTB_EXPR *ftbe;
751 float weight=ftbw->weight;
752 int yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
753 my_off_t curdoc=ftbw->docid[mode];
754 struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
755 &ft_default_parser :
756 ftb->info->s->keyinfo[ftb->keynr].parser;
757
758 for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
759 {
760 ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
761 if (ftbe->docid[mode] != curdoc)
762 {
763 ftbe->cur_weight=0;
764 ftbe->yesses=ftbe->nos=0;
765 ftbe->docid[mode]=curdoc;
766 }
767 if (ftbe->nos)
768 break;
769 if (yn_flag & FTB_FLAG_YES)
770 {
771 weight /= ftbe->ythresh;
772 ftbe->cur_weight += weight;
773 if ((int) ++ftbe->yesses == ythresh)
774 {
775 yn_flag=ftbe->flags;
776 weight=ftbe->cur_weight*ftbe->weight;
777 if (mode && ftbe->phrase)
778 {
779 int found= 0;
780
781 memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
782 while (_mi_ft_segiterator(&ftsi) && !found)
783 {
784 if (!ftsi.pos)
785 continue;
786 found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
787 if (unlikely(found < 0))
788 return 1;
789 }
790 if (!found)
791 break;
792 } /* ftbe->quot */
793 }
794 else
795 break;
796 }
797 else
798 if (yn_flag & FTB_FLAG_NO)
799 {
800 /*
801 NOTE: special sort function of queue assures that all
802 (yn_flag & FTB_FLAG_NO) != 0
803 events for every particular subexpression will
804 "auto-magically" happen BEFORE all the
805 (yn_flag & FTB_FLAG_YES) != 0 events. So no
806 already matched expression can become not-matched again.
807 */
808 ++ftbe->nos;
809 break;
810 }
811 else
812 {
813 if (ftbe->ythresh)
814 weight/=3;
815 ftbe->cur_weight += weight;
816 if ((int) ftbe->yesses < ythresh)
817 break;
818 if (!(yn_flag & FTB_FLAG_WONLY))
819 yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
820 weight*= ftbe->weight;
821 }
822 }
823 return 0;
824 }
825
826
ft_boolean_read_next(FT_INFO * ftb,char * record)827 int ft_boolean_read_next(FT_INFO *ftb, char *record)
828 {
829 FTB_EXPR *ftbe;
830 FTB_WORD *ftbw;
831 MI_INFO *info=ftb->info;
832 my_off_t curdoc;
833
834 if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
835 return -1;
836
837 /* black magic ON */
838 if ((int) _mi_check_index(info, ftb->keynr) < 0)
839 return my_errno;
840 if (_mi_readinfo(info, F_RDLCK, 1))
841 return my_errno;
842 /* black magic OFF */
843
844 if (!ftb->queue.elements)
845 return my_errno=HA_ERR_END_OF_FILE;
846
847 /* Attention!!! Address of a local variable is used here! See err: label */
848 ftb->queue.first_cmp_arg=(void *)&curdoc;
849
850 while (ftb->state == INDEX_SEARCH &&
851 (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
852 HA_OFFSET_ERROR)
853 {
854 while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
855 {
856 if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
857 {
858 my_errno= HA_ERR_OUT_OF_MEM;
859 goto err;
860 }
861
862 /* update queue */
863 _ft2_search(ftb, ftbw, 0);
864 queue_replaced(& ftb->queue);
865 }
866
867 ftbe=ftb->root;
868 if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
869 ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
870 {
871 /* curdoc matched ! */
872 if (is_tree_inited(&ftb->no_dupes) &&
873 tree_insert(&ftb->no_dupes, &curdoc, 0,
874 ftb->no_dupes.custom_arg)->count >1)
875 /* but it managed already to get past this line once */
876 continue;
877
878 info->lastpos=curdoc;
879 /* Clear all states, except that the table was updated */
880 info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
881
882 if (!(*info->read_record)(info,curdoc, (uchar*) record))
883 {
884 info->update|= HA_STATE_AKTIV; /* Record is read */
885 if (ftb->with_scan &&
886 ft_boolean_find_relevance(ftb,(uchar*) record,0)==0)
887 continue; /* no match */
888 my_errno=0;
889 goto err;
890 }
891 goto err;
892 }
893 }
894 ftb->state=INDEX_DONE;
895 my_errno=HA_ERR_END_OF_FILE;
896 err:
897 ftb->queue.first_cmp_arg=(void *)0;
898 return my_errno;
899 }
900
901
902 typedef struct st_my_ftb_find_param
903 {
904 FT_INFO *ftb;
905 FT_SEG_ITERATOR *ftsi;
906 } MY_FTB_FIND_PARAM;
907
908
ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM * param,char * word,int len,MYSQL_FTPARSER_BOOLEAN_INFO * boolean_info MY_ATTRIBUTE ((unused)))909 static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
910 char *word, int len,
911 MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info MY_ATTRIBUTE((unused)))
912 {
913 MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
914 FT_INFO *ftb= ftb_param->ftb;
915 FTB_WORD *ftbw;
916 int a, b, c;
917 /*
918 Find right-most element in the array of query words matching this
919 word from a document.
920 */
921 for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
922 {
923 ftbw= ftb->list[c];
924 if (ha_compare_text(ftb->charset, (uchar*)word, len,
925 (uchar*)ftbw->word+1, ftbw->len-1,
926 (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0) < 0)
927 b= c;
928 else
929 a= c;
930 }
931 /*
932 If there were no words with truncation operator, we iterate to the
933 beginning of an array until array element is equal to the word from
934 a document. This is done mainly because the same word may be
935 mentioned twice (or more) in the query.
936
937 In case query has words with truncation operator we must iterate
938 to the beginning of the array. There may be non-matching query words
939 between matching word with truncation operator and the right-most
940 matching element. E.g., if we're looking for 'aaa15' in an array of
941 'aaa1* aaa14 aaa15 aaa16'.
942
943 Worse of that there still may be match even if the binary search
944 above didn't find matching element. E.g., if we're looking for
945 'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
946 stop at 'aaa16'.
947 */
948 for (; c >= 0; c--)
949 {
950 ftbw= ftb->list[c];
951 if (ha_compare_text(ftb->charset, (uchar*)word, len,
952 (uchar*)ftbw->word + 1,ftbw->len - 1,
953 (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
954 {
955 if (ftb->with_scan & FTB_FLAG_TRUNC)
956 continue;
957 else
958 break;
959 }
960 if (ftbw->docid[1] == ftb->info->lastpos)
961 continue;
962 ftbw->docid[1]= ftb->info->lastpos;
963 if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
964 return 1;
965 }
966 return(0);
967 }
968
969
ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM * param,char * doc,int len)970 static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
971 char *doc, int len)
972 {
973 MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
974 FT_INFO *ftb= ftb_param->ftb;
975 uchar *end= (uchar*) doc + len;
976 FT_WORD w;
977 while (ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE))
978 param->mysql_add_word(param, (char*) w.pos, w.len, 0);
979 return(0);
980 }
981
982
ft_boolean_find_relevance(FT_INFO * ftb,uchar * record,uint length)983 float ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
984 {
985 FTB_EXPR *ftbe;
986 FT_SEG_ITERATOR ftsi, ftsi2;
987 my_off_t docid=ftb->info->lastpos;
988 MY_FTB_FIND_PARAM ftb_param;
989 MYSQL_FTPARSER_PARAM *param;
990 struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
991 &ft_default_parser :
992 ftb->info->s->keyinfo[ftb->keynr].parser;
993
994 if (docid == HA_OFFSET_ERROR)
995 return -2.0;
996 if (!ftb->queue.elements)
997 return 0;
998 if (! (param= ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
999 return 0;
1000
1001 if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
1002 {
1003 FTB_EXPR *x;
1004 uint i;
1005
1006 for (i=0; i < ftb->queue.elements; i++)
1007 {
1008 ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
1009 for (x=ftb->list[i]->up; x; x=x->up)
1010 x->docid[1]=HA_OFFSET_ERROR;
1011 }
1012 }
1013
1014 ftb->lastpos=docid;
1015
1016 if (ftb->keynr==NO_SUCH_KEY)
1017 _mi_ft_segiterator_dummy_init(record, length, &ftsi);
1018 else
1019 _mi_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
1020 memcpy(&ftsi2, &ftsi, sizeof(ftsi));
1021
1022 ftb_param.ftb= ftb;
1023 ftb_param.ftsi= &ftsi2;
1024 param->mysql_parse= ftb_find_relevance_parse;
1025 param->mysql_add_word= ftb_find_relevance_add_word;
1026 param->mysql_ftparam= (void *)&ftb_param;
1027 param->flags= 0;
1028 param->cs= ftb->charset;
1029 param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
1030 while (_mi_ft_segiterator(&ftsi))
1031 {
1032 if (!ftsi.pos)
1033 continue;
1034 param->doc= (char *)ftsi.pos;
1035 param->length= ftsi.len;
1036 if (unlikely(parser->parse(param)))
1037 return 0;
1038 }
1039 ftbe=ftb->root;
1040 if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
1041 ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
1042 { /* row matched ! */
1043 return ftbe->cur_weight;
1044 }
1045 else
1046 { /* match failed ! */
1047 return 0.0;
1048 }
1049 }
1050
1051
ft_boolean_close_search(FT_INFO * ftb)1052 void ft_boolean_close_search(FT_INFO *ftb)
1053 {
1054 if (is_tree_inited(& ftb->no_dupes))
1055 {
1056 delete_tree(& ftb->no_dupes);
1057 }
1058 free_root(& ftb->mem_root, MYF(0));
1059 my_free(ftb);
1060 }
1061
1062
ft_boolean_get_relevance(FT_INFO * ftb)1063 float ft_boolean_get_relevance(FT_INFO *ftb)
1064 {
1065 return ftb->root->cur_weight;
1066 }
1067
1068
ft_boolean_reinit_search(FT_INFO * ftb)1069 void ft_boolean_reinit_search(FT_INFO *ftb)
1070 {
1071 _ftb_init_index_search(ftb);
1072 }
1073
1074